From ca4f34d24833540127d2d66f2267ab17559e9b45 Mon Sep 17 00:00:00 2001 From: waleed Date: Sun, 14 Jun 2026 19:13:43 -0700 Subject: [PATCH 1/3] feat(context-dev): add Context.dev web + brand data integration --- apps/docs/components/icons.tsx | 11 + apps/docs/components/ui/icon-mapping.ts | 2 + .../docs/en/integrations/context_dev.mdx | 306 +++++++++ .../content/docs/en/integrations/meta.json | 1 + apps/sim/blocks/blocks/context_dev.ts | 634 ++++++++++++++++++ apps/sim/blocks/registry.ts | 3 + apps/sim/components/icons.tsx | 11 + apps/sim/lib/integrations/icon-mapping.ts | 2 + apps/sim/lib/integrations/integrations.json | 61 +- apps/sim/tools/context_dev/classify_naics.ts | 96 +++ apps/sim/tools/context_dev/classify_sic.ts | 117 ++++ apps/sim/tools/context_dev/crawl.ts | 144 ++++ apps/sim/tools/context_dev/extract.ts | 135 ++++ apps/sim/tools/context_dev/get_brand.ts | 116 ++++ apps/sim/tools/context_dev/index.ts | 10 + apps/sim/tools/context_dev/map.ts | 90 +++ apps/sim/tools/context_dev/scrape_html.ts | 106 +++ apps/sim/tools/context_dev/scrape_markdown.ts | 115 ++++ apps/sim/tools/context_dev/screenshot.ts | 138 ++++ apps/sim/tools/context_dev/search.ts | 108 +++ apps/sim/tools/context_dev/types.ts | 228 +++++++ apps/sim/tools/context_dev/utils.ts | 80 +++ apps/sim/tools/registry.ts | 22 + 23 files changed, 2535 insertions(+), 1 deletion(-) create mode 100644 apps/docs/content/docs/en/integrations/context_dev.mdx create mode 100644 apps/sim/blocks/blocks/context_dev.ts create mode 100644 apps/sim/tools/context_dev/classify_naics.ts create mode 100644 apps/sim/tools/context_dev/classify_sic.ts create mode 100644 apps/sim/tools/context_dev/crawl.ts create mode 100644 apps/sim/tools/context_dev/extract.ts create mode 100644 apps/sim/tools/context_dev/get_brand.ts create mode 100644 apps/sim/tools/context_dev/index.ts create mode 100644 apps/sim/tools/context_dev/map.ts create mode 100644 apps/sim/tools/context_dev/scrape_html.ts create mode 100644 
apps/sim/tools/context_dev/scrape_markdown.ts create mode 100644 apps/sim/tools/context_dev/screenshot.ts create mode 100644 apps/sim/tools/context_dev/search.ts create mode 100644 apps/sim/tools/context_dev/types.ts create mode 100644 apps/sim/tools/context_dev/utils.ts diff --git a/apps/docs/components/icons.tsx b/apps/docs/components/icons.tsx index 8ea1529b1e8..dc3c91c5bef 100644 --- a/apps/docs/components/icons.tsx +++ b/apps/docs/components/icons.tsx @@ -2053,6 +2053,17 @@ export function ConfluenceIcon(props: SVGProps) { ) } +export function ContextDevIcon(props: SVGProps) { + return ( + + + + ) +} + export function ConvexIcon(props: SVGProps) { return ( = { codepipeline: CodePipelineIcon, confluence: ConfluenceIcon, confluence_v2: ConfluenceIcon, + context_dev: ContextDevIcon, convex: ConvexIcon, crowdstrike: CrowdStrikeIcon, cursor: CursorIcon, diff --git a/apps/docs/content/docs/en/integrations/context_dev.mdx b/apps/docs/content/docs/en/integrations/context_dev.mdx new file mode 100644 index 00000000000..7e02f980ccd --- /dev/null +++ b/apps/docs/content/docs/en/integrations/context_dev.mdx @@ -0,0 +1,306 @@ +--- +title: Context.dev +description: Scrape, crawl, search, extract, and enrich web and brand data +--- + +import { BlockInfoCard } from "@/components/ui/block-info-card" + + + +## Usage Instructions + +Integrate Context.dev into the workflow. Scrape pages to markdown or HTML, capture screenshots, crawl entire sites, map sitemaps, search the web, extract structured data, classify industries, and retrieve brand assets — all from one API. + + + +## Actions + +### `context_dev_scrape_markdown` + +Scrape any URL and return clean, LLM-ready markdown content. 
+ +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `url` | string | Yes | The full URL to scrape \(must include http:// or https://\) | +| `useMainContentOnly` | boolean | No | Return only main content, excluding headers, footers, and navigation | +| `includeLinks` | boolean | No | Preserve hyperlinks in the markdown output \(default: true\) | +| `includeImages` | boolean | No | Include image references in the markdown output \(default: false\) | +| `includeFrames` | boolean | No | Render iframe contents inline \(default: false\) | +| `maxAgeMs` | number | No | Cache duration in milliseconds \(0-2592000000, default: 86400000\) | +| `waitForMs` | number | No | Browser wait time after page load in milliseconds \(0-30000\) | +| `timeoutMS` | number | No | Request timeout in milliseconds \(1000-300000\) | +| `apiKey` | string | Yes | Context.dev API key | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `markdown` | string | Page content as clean markdown | +| `url` | string | The scraped URL | + +### `context_dev_scrape_html` + +Scrape any URL and return the raw HTML content of the page. 
+ +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `url` | string | Yes | The full URL to scrape \(must include http:// or https://\) | +| `useMainContentOnly` | boolean | No | Return only main content, excluding headers, footers, and navigation | +| `includeFrames` | boolean | No | Render iframe contents inline into the returned HTML \(default: false\) | +| `maxAgeMs` | number | No | Cache duration in milliseconds \(0-2592000000, default: 86400000\) | +| `waitForMs` | number | No | Browser wait time after page load in milliseconds \(0-30000\) | +| `timeoutMS` | number | No | Request timeout in milliseconds \(1000-300000\) | +| `apiKey` | string | Yes | Context.dev API key | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `html` | string | Raw HTML content of the page | +| `url` | string | The scraped URL | +| `type` | string | Detected content type \(html, xml, json, text, csv, markdown, svg, pdf\) | + +### `context_dev_screenshot` + +Capture a screenshot of any web page and store it as a downloadable image file. 
+ +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `url` | string | Yes | The full URL to capture \(must include http:// or https://\) | +| `fullScreenshot` | boolean | No | Capture the full scrollable page instead of just the viewport \(default: false\) | +| `handleCookiePopup` | boolean | No | Attempt to dismiss cookie banners before capturing \(default: false\) | +| `viewportWidth` | number | No | Viewport width in pixels \(240-7680, default: 1920\) | +| `viewportHeight` | number | No | Viewport height in pixels \(240-4320, default: 1080\) | +| `maxAgeMs` | number | No | Cache duration in milliseconds \(0-2592000000, default: 86400000\) | +| `waitForMs` | number | No | Post-load delay before capturing in milliseconds \(0-30000, default: 3000\) | +| `timeoutMS` | number | No | Request timeout in milliseconds \(1000-300000\) | +| `apiKey` | string | Yes | Context.dev API key | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `file` | file | Stored screenshot image file | +| `screenshotUrl` | string | Public URL of the captured screenshot | +| `screenshotType` | string | Screenshot type \(viewport or fullPage\) | +| `domain` | string | Domain that was captured | +| `width` | number | Screenshot width in pixels | +| `height` | number | Screenshot height in pixels | + +### `context_dev_crawl` + +Crawl an entire website and return each discovered page as clean markdown. 
+ +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `url` | string | Yes | The starting URL to crawl \(must include http:// or https://\) | +| `maxPages` | number | No | Maximum number of pages to crawl \(1-500, default: 100\) | +| `maxDepth` | number | No | Maximum link depth from the starting URL \(0 = start page only\) | +| `urlRegex` | string | No | Regex pattern to filter which URLs are crawled | +| `includeLinks` | boolean | No | Preserve hyperlinks in the markdown output \(default: true\) | +| `includeImages` | boolean | No | Include image references in the markdown output \(default: false\) | +| `useMainContentOnly` | boolean | No | Strip headers, footers, and sidebars from each page \(default: false\) | +| `followSubdomains` | boolean | No | Follow links to subdomains of the starting domain \(default: false\) | +| `maxAgeMs` | number | No | Cache duration in milliseconds \(0-2592000000, default: 86400000\) | +| `waitForMs` | number | No | Browser wait time after page load in milliseconds \(0-30000\) | +| `stopAfterMs` | number | No | Soft crawl time budget in milliseconds \(10000-110000, default: 80000\) | +| `timeoutMS` | number | No | Request timeout in milliseconds \(1000-300000\) | +| `apiKey` | string | Yes | Context.dev API key | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `results` | array | Crawled pages with markdown content and per-page metadata | +| ↳ `markdown` | string | Page content as markdown | +| ↳ `metadata` | json | Page metadata \(url, title, crawlDepth, statusCode\) | +| `metadata` | object | Crawl summary \(numUrls, maxCrawlDepth, numSucceeded, numFailed, numSkipped\) | + +### `context_dev_map` + +Build a sitemap of a domain and return every discovered page URL. 
+ +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `domain` | string | Yes | The domain to build a sitemap for \(e.g., "example.com"\) | +| `maxLinks` | number | No | Maximum number of URLs to return \(1-100000, default: 10000\) | +| `urlRegex` | string | No | RE2-compatible regex to filter URLs \(max 256 chars\) | +| `timeoutMS` | number | No | Request timeout in milliseconds \(1000-300000\) | +| `apiKey` | string | Yes | Context.dev API key | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `domain` | string | The domain that was mapped | +| `urls` | array | All page URLs discovered from the sitemap | +| `meta` | object | Sitemap discovery stats \(sitemapsDiscovered, sitemapsFetched, errors\) | + +### `context_dev_search` + +Search the web with natural language and optionally scrape results to markdown. + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `query` | string | Yes | The natural language search query \(1-500 characters\) | +| `includeDomains` | array | No | Only return results from these domains | +| `excludeDomains` | array | No | Exclude results from these domains | +| `freshness` | string | No | Recency filter \(last_24_hours, last_week, last_month, last_year\) | +| `queryFanout` | boolean | No | Expand the query into parallel variants for broader coverage | +| `markdownEnabled` | boolean | No | Scrape each result page to markdown \(default: false\) | +| `timeoutMS` | number | No | Request timeout in milliseconds \(1000-300000\) | +| `apiKey` | string | Yes | Context.dev API key | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `results` | array | Search results with url, title, description, relevance, and optional markdown | +| ↳ `url` | string | Result page URL | +| ↳ `title` | string | Result page title | +| ↳ `description` | string | 
Result snippet/description | +| ↳ `relevance` | string | Relevance rating \(high, medium, low\) | +| ↳ `markdown` | json | Scraped markdown for the result \(when markdown scraping is enabled\) | +| `query` | string | The query that was searched | + +### `context_dev_extract` + +Crawl a website and extract structured data matching a provided JSON schema. + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `url` | string | Yes | The starting website URL \(must include http:// or https://\) | +| `schema` | json | Yes | JSON Schema describing the structure of the data to extract | +| `instructions` | string | No | Optional extraction guidance for link prioritization \(max 2000 chars\) | +| `factCheck` | boolean | No | Require extracted values to be grounded in page facts \(default: false\) | +| `followSubdomains` | boolean | No | Follow links on subdomains of the starting domain \(default: false\) | +| `maxPages` | number | No | Maximum number of pages to analyze \(1-50, default: 5\) | +| `maxDepth` | number | No | Maximum link depth from the starting URL | +| `maxAgeMs` | number | No | Cache duration in milliseconds \(0-2592000000, default: 604800000\) | +| `stopAfterMs` | number | No | Soft crawl time budget in milliseconds \(10000-110000, default: 80000\) | +| `timeoutMS` | number | No | Request timeout in milliseconds \(1000-300000\) | +| `apiKey` | string | Yes | Context.dev API key | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `status` | string | Extraction status | +| `url` | string | The starting URL that was crawled | +| `urlsAnalyzed` | array | URLs that were analyzed during extraction | +| `data` | json | Structured data matching the requested schema | +| `metadata` | object | Crawl summary \(numUrls, maxCrawlDepth, numSucceeded, numFailed, numSkipped\) | + +### `context_dev_classify_naics` + +Classify a brand into NAICS industry codes from its domain 
or company name. + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `input` | string | Yes | Brand domain or company name to classify \(e.g., "stripe.com" or "Stripe"\) | +| `minResults` | number | No | Minimum number of codes to return \(1-10, default: 1\) | +| `maxResults` | number | No | Maximum number of codes to return \(1-10, default: 5\) | +| `timeoutMS` | number | No | Request timeout in milliseconds \(1000-300000\) | +| `apiKey` | string | Yes | Context.dev API key | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `status` | string | Classification status | +| `domain` | string | Resolved domain | +| `type` | string | Input type that was resolved | +| `codes` | array | Matched NAICS codes with name and confidence | +| ↳ `code` | string | Industry code | +| ↳ `name` | string | Industry name | +| ↳ `confidence` | string | Match confidence \(high, medium, low\) | + +### `context_dev_classify_sic` + +Classify a brand into SIC industry codes from its domain or company name. 
+ +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `input` | string | Yes | Brand domain or company name to classify \(e.g., "stripe.com" or "Stripe"\) | +| `type` | string | No | SIC taxonomy version: "original_sic" \(default\) or "latest_sec" | +| `minResults` | number | No | Minimum number of codes to return \(1-10, default: 1\) | +| `maxResults` | number | No | Maximum number of codes to return \(1-10, default: 5\) | +| `timeoutMS` | number | No | Request timeout in milliseconds \(1000-300000\) | +| `apiKey` | string | Yes | Context.dev API key | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `status` | string | Classification status | +| `domain` | string | Resolved domain | +| `type` | string | Input type that was resolved | +| `classification` | string | SIC taxonomy version used \(original_sic or latest_sec\) | +| `codes` | array | Matched SIC codes with name, confidence, and group metadata | +| ↳ `code` | string | Industry code | +| ↳ `name` | string | Industry name | +| ↳ `confidence` | string | Match confidence \(high, medium, low\) | +| ↳ `majorGroup` | string | Major group code \(original_sic only\) | +| ↳ `majorGroupName` | string | Major group name \(original_sic only\) | +| ↳ `office` | string | SEC office \(latest_sec only\) | + +### `context_dev_get_brand` + +Retrieve brand data for a domain: logos, colors, backdrops, socials, address, and industry. 
+ +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `domain` | string | Yes | The domain to retrieve brand data for \(e.g., "airbnb.com"\) | +| `forceLanguage` | string | No | Override the detected language with a supported language code | +| `maxSpeed` | boolean | No | Skip time-consuming operations for a faster response \(default: false\) | +| `maxAgeMs` | number | No | Cache max age in milliseconds \(86400000-31536000000, default: 7776000000\) | +| `timeoutMS` | number | No | Request timeout in milliseconds \(1000-300000\) | +| `apiKey` | string | Yes | Context.dev API key | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `status` | string | Retrieval status | +| `brand` | object | Brand data object | +| ↳ `domain` | string | Brand domain | +| ↳ `title` | string | Brand title | +| ↳ `description` | string | Brand description | +| ↳ `slogan` | string | Brand slogan | +| ↳ `colors` | json | Brand colors \(hex and name\) | +| ↳ `logos` | json | Brand logos with mode, colors, resolution, and type | +| ↳ `backdrops` | json | Brand backdrop images | +| ↳ `socials` | json | Social media profiles \(type and url\) | +| ↳ `address` | json | Brand address | +| ↳ `stock` | json | Stock info \(ticker and exchange\) | +| ↳ `is_nsfw` | boolean | Whether the brand contains adult content | +| ↳ `email` | string | Brand contact email | +| ↳ `phone` | string | Brand contact phone | +| ↳ `industries` | json | Industry taxonomy \(eic pairs\) | +| ↳ `links` | json | Key brand links \(careers, privacy, terms, etc.\) | +| ↳ `primary_language` | string | Primary language of the brand site | + + diff --git a/apps/docs/content/docs/en/integrations/meta.json b/apps/docs/content/docs/en/integrations/meta.json index e67c39642b8..0ee2e26089d 100644 --- a/apps/docs/content/docs/en/integrations/meta.json +++ b/apps/docs/content/docs/en/integrations/meta.json @@ -35,6 +35,7 @@ "cloudwatch", 
"codepipeline", "confluence", + "context_dev", "convex", "crowdstrike", "cursor", diff --git a/apps/sim/blocks/blocks/context_dev.ts b/apps/sim/blocks/blocks/context_dev.ts new file mode 100644 index 00000000000..fead0ac24d3 --- /dev/null +++ b/apps/sim/blocks/blocks/context_dev.ts @@ -0,0 +1,634 @@ +import { ContextDevIcon } from '@/components/icons' +import type { BlockConfig, BlockMeta } from '@/blocks/types' +import { AuthMode, IntegrationType } from '@/blocks/types' +import type { ContextDevScrapeMarkdownResponse } from '@/tools/context_dev/types' + +const SCRAPE_OPS = ['scrape_markdown', 'scrape_html', 'screenshot', 'crawl', 'extract'] + +/** + * Coerces a number or numeric string into a finite number; returns undefined for null/undefined, blank (including whitespace-only) strings, and anything that does not coerce to a finite number. + */ +function toNumber(value: unknown): number | undefined { + if (value == null || (typeof value === 'string' && value.trim() === '')) return undefined + const parsed = Number(value) + return Number.isFinite(parsed) ? parsed : undefined +} + +/** + * Normalizes an array, a JSON-encoded array, a single JSON-quoted string, or a comma-separated string into a string[] (non-string array entries are dropped); returns undefined for blank or non-list input. + */ +function toStringArray(value: unknown): string[] | undefined { + if (Array.isArray(value)) return value.filter((v): v is string => typeof v === 'string') + if (typeof value !== 'string' || value.trim() === '') return undefined + try { + const parsed: unknown = JSON.parse(value) + if (Array.isArray(parsed)) return parsed.filter((v): v is string => typeof v === 'string') + // A single JSON-quoted entry ("example.com") parses as a string, not an array; keep it as a one-element list instead of dropping it. + return typeof parsed === 'string' && parsed.trim() ? [parsed.trim()] : undefined + } catch { + return value + .split(',') + .map((v) => v.trim()) + .filter(Boolean) + } +} + +export const ContextDevBlock: BlockConfig = { + type: 'context_dev', + name: 'Context.dev', + description: 'Scrape, crawl, search, extract, and enrich web and brand data', + authMode: AuthMode.ApiKey, + longDescription: + 'Integrate Context.dev into the workflow. 
Scrape pages to markdown or HTML, capture screenshots, crawl entire sites, map sitemaps, search the web, extract structured data, classify industries, and retrieve brand assets — all from one API.', + docsLink: 'https://docs.sim.ai/integrations/context_dev', + category: 'tools', + integrationType: IntegrationType.Search, + bgColor: '#ffffff', + icon: ContextDevIcon, + subBlocks: [ + { + id: 'operation', + title: 'Operation', + type: 'dropdown', + options: [ + { label: 'Scrape Markdown', id: 'scrape_markdown' }, + { label: 'Scrape HTML', id: 'scrape_html' }, + { label: 'Screenshot', id: 'screenshot' }, + { label: 'Crawl Website', id: 'crawl' }, + { label: 'Map Sitemap', id: 'map' }, + { label: 'Web Search', id: 'search' }, + { label: 'Extract Structured Data', id: 'extract' }, + { label: 'Classify NAICS', id: 'classify_naics' }, + { label: 'Classify SIC', id: 'classify_sic' }, + { label: 'Get Brand Data', id: 'get_brand' }, + ], + value: () => 'scrape_markdown', + }, + { + id: 'url', + title: 'Website URL', + type: 'short-input', + placeholder: 'https://example.com', + condition: { field: 'operation', value: SCRAPE_OPS }, + required: { field: 'operation', value: SCRAPE_OPS }, + }, + { + id: 'domain', + title: 'Domain', + type: 'short-input', + placeholder: 'example.com', + condition: { field: 'operation', value: ['map', 'get_brand'] }, + required: { field: 'operation', value: ['map', 'get_brand'] }, + }, + { + id: 'input', + title: 'Domain or Company Name', + type: 'short-input', + placeholder: 'example.com or Company Name', + condition: { field: 'operation', value: ['classify_naics', 'classify_sic'] }, + required: { field: 'operation', value: ['classify_naics', 'classify_sic'] }, + }, + { + id: 'query', + title: 'Search Query', + type: 'short-input', + placeholder: 'Enter your search query', + condition: { field: 'operation', value: 'search' }, + required: { field: 'operation', value: 'search' }, + }, + { + id: 'schema', + title: 'Extraction Schema', + type: 
'code', + language: 'json', + placeholder: 'Enter a JSON schema describing the data to extract...', + condition: { field: 'operation', value: 'extract' }, + required: { field: 'operation', value: 'extract' }, + wandConfig: { + enabled: true, + maintainHistory: true, + prompt: `You are an expert at writing JSON Schemas for structured web data extraction. +Generate ONLY the JSON schema based on the user's request. +The output MUST be a single, valid JSON object, starting with { and ending with }. +Use standard JSON Schema properties (type, description, properties, items, required). + +Current schema: {context} + +Do not include any explanations, markdown formatting, or other text outside the JSON object.`, + placeholder: 'Describe the data structure you want to extract...', + generationType: 'json-schema', + }, + }, + { + id: 'instructions', + title: 'Instructions', + type: 'long-input', + placeholder: 'Optional guidance for which links to prioritize', + mode: 'advanced', + condition: { field: 'operation', value: 'extract' }, + }, + { + id: 'useMainContentOnly', + title: 'Only Main Content', + type: 'switch', + condition: { field: 'operation', value: ['scrape_markdown', 'scrape_html', 'crawl'] }, + }, + { + id: 'includeLinks', + title: 'Include Links', + type: 'switch', + condition: { field: 'operation', value: ['scrape_markdown', 'crawl'] }, + }, + { + id: 'includeImages', + title: 'Include Images', + type: 'switch', + condition: { field: 'operation', value: ['scrape_markdown', 'crawl'] }, + }, + { + id: 'fullScreenshot', + title: 'Full Page Screenshot', + type: 'switch', + condition: { field: 'operation', value: 'screenshot' }, + }, + { + id: 'handleCookiePopup', + title: 'Dismiss Cookie Popups', + type: 'switch', + condition: { field: 'operation', value: 'screenshot' }, + }, + { + id: 'markdownEnabled', + title: 'Scrape Results to Markdown', + type: 'switch', + condition: { field: 'operation', value: 'search' }, + }, + { + id: 'sicType', + title: 'SIC Taxonomy', + 
type: 'dropdown', + options: [ + { label: 'Original SIC', id: 'original_sic' }, + { label: 'Latest SEC', id: 'latest_sec' }, + ], + value: () => 'original_sic', + condition: { field: 'operation', value: 'classify_sic' }, + }, + { + id: 'freshness', + title: 'Freshness', + type: 'dropdown', + options: [ + { label: 'Last 24 Hours', id: 'last_24_hours' }, + { label: 'Last Week', id: 'last_week' }, + { label: 'Last Month', id: 'last_month' }, + { label: 'Last Year', id: 'last_year' }, + ], + mode: 'advanced', + condition: { field: 'operation', value: 'search' }, + }, + { + id: 'includeDomains', + title: 'Include Domains', + type: 'long-input', + placeholder: '["example.com", "docs.example.com"]', + mode: 'advanced', + condition: { field: 'operation', value: 'search' }, + }, + { + id: 'excludeDomains', + title: 'Exclude Domains', + type: 'long-input', + placeholder: '["spam.com"]', + mode: 'advanced', + condition: { field: 'operation', value: 'search' }, + }, + { + id: 'queryFanout', + title: 'Query Fan-out', + type: 'switch', + mode: 'advanced', + condition: { field: 'operation', value: 'search' }, + }, + { + id: 'factCheck', + title: 'Fact Check', + type: 'switch', + mode: 'advanced', + condition: { field: 'operation', value: 'extract' }, + }, + { + id: 'followSubdomains', + title: 'Follow Subdomains', + type: 'switch', + mode: 'advanced', + condition: { field: 'operation', value: ['crawl', 'extract'] }, + }, + { + id: 'maxPages', + title: 'Max Pages', + type: 'short-input', + placeholder: 'crawl: 100, extract: 5', + mode: 'advanced', + condition: { field: 'operation', value: ['crawl', 'extract'] }, + }, + { + id: 'maxDepth', + title: 'Max Depth', + type: 'short-input', + placeholder: 'Maximum link depth', + mode: 'advanced', + condition: { field: 'operation', value: ['crawl', 'extract'] }, + }, + { + id: 'urlRegex', + title: 'URL Regex', + type: 'short-input', + placeholder: 'Regex to filter URLs', + mode: 'advanced', + condition: { field: 'operation', value: 
['crawl', 'map'] }, + }, + { + id: 'maxLinks', + title: 'Max Links', + type: 'short-input', + placeholder: '10000', + mode: 'advanced', + condition: { field: 'operation', value: 'map' }, + }, + { + id: 'viewportWidth', + title: 'Viewport Width', + type: 'short-input', + placeholder: '1920', + mode: 'advanced', + condition: { field: 'operation', value: 'screenshot' }, + }, + { + id: 'viewportHeight', + title: 'Viewport Height', + type: 'short-input', + placeholder: '1080', + mode: 'advanced', + condition: { field: 'operation', value: 'screenshot' }, + }, + { + id: 'minResults', + title: 'Min Results', + type: 'short-input', + placeholder: '1', + mode: 'advanced', + condition: { field: 'operation', value: ['classify_naics', 'classify_sic'] }, + }, + { + id: 'maxResults', + title: 'Max Results', + type: 'short-input', + placeholder: '5', + mode: 'advanced', + condition: { field: 'operation', value: ['classify_naics', 'classify_sic'] }, + }, + { + id: 'forceLanguage', + title: 'Force Language', + type: 'short-input', + placeholder: 'e.g., en, es, fr', + mode: 'advanced', + condition: { field: 'operation', value: 'get_brand' }, + }, + { + id: 'maxSpeed', + title: 'Max Speed', + type: 'switch', + mode: 'advanced', + condition: { field: 'operation', value: 'get_brand' }, + }, + { + id: 'waitForMs', + title: 'Wait For (ms)', + type: 'short-input', + placeholder: '0', + mode: 'advanced', + condition: { + field: 'operation', + value: ['scrape_markdown', 'scrape_html', 'screenshot', 'crawl'], + }, + }, + { + id: 'stopAfterMs', + title: 'Stop After (ms)', + type: 'short-input', + placeholder: '80000', + mode: 'advanced', + condition: { field: 'operation', value: ['crawl', 'extract'] }, + }, + { + id: 'maxAgeMs', + title: 'Cache Max Age (ms)', + type: 'short-input', + placeholder: '86400000', + mode: 'advanced', + condition: { + field: 'operation', + value: ['scrape_markdown', 'scrape_html', 'screenshot', 'crawl', 'extract', 'get_brand'], + }, + }, + { + id: 'timeoutMS', + 
title: 'Timeout (ms)', + type: 'short-input', + placeholder: '60000', + mode: 'advanced', + }, + { + id: 'apiKey', + title: 'API Key', + type: 'short-input', + placeholder: 'Enter your Context.dev API key', + password: true, + required: true, + }, + ], + tools: { + access: [ + 'context_dev_scrape_markdown', + 'context_dev_scrape_html', + 'context_dev_screenshot', + 'context_dev_crawl', + 'context_dev_map', + 'context_dev_search', + 'context_dev_extract', + 'context_dev_classify_naics', + 'context_dev_classify_sic', + 'context_dev_get_brand', + ], + config: { + tool: (params) => + params.operation ? `context_dev_${params.operation}` : 'context_dev_scrape_markdown', + params: (params) => { + const { operation, apiKey } = params + const result: Record = { apiKey } + + const setBool = (key: string) => { + if (params[key] != null) result[key] = params[key] + } + const setNumber = (key: string, target = key) => { + const n = toNumber(params[key]) + if (n !== undefined) result[target] = n + } + const setString = (key: string, target = key) => { + if (params[key]) result[target] = params[key] + } + + switch (operation) { + case 'scrape_markdown': + setString('url') + setBool('useMainContentOnly') + setBool('includeLinks') + setBool('includeImages') + setNumber('maxAgeMs') + setNumber('waitForMs') + setNumber('timeoutMS') + break + case 'scrape_html': + setString('url') + setBool('useMainContentOnly') + setNumber('maxAgeMs') + setNumber('waitForMs') + setNumber('timeoutMS') + break + case 'screenshot': + setString('url') + setBool('fullScreenshot') + setBool('handleCookiePopup') + setNumber('viewportWidth') + setNumber('viewportHeight') + setNumber('maxAgeMs') + setNumber('waitForMs') + setNumber('timeoutMS') + break + case 'crawl': + setString('url') + setNumber('maxPages') + setNumber('maxDepth') + setString('urlRegex') + setBool('useMainContentOnly') + setBool('includeLinks') + setBool('includeImages') + setBool('followSubdomains') + setNumber('maxAgeMs') + 
setNumber('waitForMs') + setNumber('stopAfterMs') + setNumber('timeoutMS') + break + case 'map': + setString('domain') + setNumber('maxLinks') + setString('urlRegex') + setNumber('timeoutMS') + break + case 'search': { + setString('query') + const include = toStringArray(params.includeDomains) + if (include?.length) result.includeDomains = include + const exclude = toStringArray(params.excludeDomains) + if (exclude?.length) result.excludeDomains = exclude + setString('freshness') + setBool('queryFanout') + setBool('markdownEnabled') + setNumber('timeoutMS') + break + } + case 'extract': { + setString('url') + if (params.schema) { + if (typeof params.schema === 'object') { + result.schema = params.schema + } else if (typeof params.schema === 'string') { + try { + result.schema = JSON.parse(params.schema) + } catch { + throw new Error('Extraction schema must be valid JSON') + } + } + } + setString('instructions') + setBool('factCheck') + setBool('followSubdomains') + setNumber('maxPages') + setNumber('maxDepth') + setNumber('maxAgeMs') + setNumber('stopAfterMs') + setNumber('timeoutMS') + break + } + case 'classify_naics': + setString('input') + setNumber('minResults') + setNumber('maxResults') + setNumber('timeoutMS') + break + case 'classify_sic': + setString('input') + setString('sicType', 'type') + setNumber('minResults') + setNumber('maxResults') + setNumber('timeoutMS') + break + case 'get_brand': + setString('domain') + setString('forceLanguage') + setBool('maxSpeed') + setNumber('maxAgeMs') + setNumber('timeoutMS') + break + } + + return result + }, + }, + }, + inputs: { + apiKey: { type: 'string', description: 'Context.dev API key' }, + operation: { type: 'string', description: 'Operation to perform' }, + url: { type: 'string', description: 'Target website URL' }, + domain: { type: 'string', description: 'Target domain' }, + input: { type: 'string', description: 'Domain or company name for classification' }, + query: { type: 'string', description: 'Web 
search query' }, + schema: { type: 'json', description: 'JSON schema for structured extraction' }, + instructions: { type: 'string', description: 'Extraction guidance' }, + useMainContentOnly: { type: 'boolean', description: 'Return only main content' }, + includeLinks: { type: 'boolean', description: 'Preserve hyperlinks' }, + includeImages: { type: 'boolean', description: 'Include image references' }, + fullScreenshot: { type: 'boolean', description: 'Capture the full page' }, + handleCookiePopup: { type: 'boolean', description: 'Dismiss cookie banners' }, + markdownEnabled: { type: 'boolean', description: 'Scrape search results to markdown' }, + sicType: { type: 'string', description: 'SIC taxonomy version' }, + freshness: { type: 'string', description: 'Search recency filter' }, + includeDomains: { type: 'json', description: 'Domains to allowlist in search' }, + excludeDomains: { type: 'json', description: 'Domains to blocklist in search' }, + queryFanout: { type: 'boolean', description: 'Expand query into variants' }, + factCheck: { type: 'boolean', description: 'Ground extracted values in page facts' }, + followSubdomains: { type: 'boolean', description: 'Follow subdomain links' }, + maxPages: { type: 'number', description: 'Maximum pages to process' }, + maxDepth: { type: 'number', description: 'Maximum link depth' }, + urlRegex: { type: 'string', description: 'Regex to filter URLs' }, + maxLinks: { type: 'number', description: 'Maximum sitemap URLs' }, + viewportWidth: { type: 'number', description: 'Screenshot viewport width' }, + viewportHeight: { type: 'number', description: 'Screenshot viewport height' }, + minResults: { type: 'number', description: 'Minimum classification results' }, + maxResults: { type: 'number', description: 'Maximum classification results' }, + forceLanguage: { type: 'string', description: 'Override detected brand language' }, + maxSpeed: { type: 'boolean', description: 'Skip slow brand operations' }, + waitForMs: { type: 'number', 
description: 'Browser wait time in ms' }, + stopAfterMs: { type: 'number', description: 'Soft crawl time budget in ms' }, + maxAgeMs: { type: 'number', description: 'Cache max age in ms' }, + timeoutMS: { type: 'number', description: 'Request timeout in ms' }, + }, + outputs: { + markdown: { type: 'string', description: 'Scraped markdown content' }, + html: { type: 'string', description: 'Scraped raw HTML content' }, + type: { type: 'string', description: 'Detected content type or resolved input type' }, + url: { type: 'string', description: 'Resolved target URL' }, + file: { type: 'file', description: 'Stored screenshot image file' }, + screenshotUrl: { type: 'string', description: 'Public URL of the captured screenshot' }, + screenshotType: { type: 'string', description: 'Screenshot type (viewport or fullPage)' }, + domain: { type: 'string', description: 'Resolved domain' }, + width: { type: 'number', description: 'Screenshot width in pixels' }, + height: { type: 'number', description: 'Screenshot height in pixels' }, + results: { type: 'json', description: 'Crawl pages or search results' }, + metadata: { type: 'json', description: 'Crawl or extraction summary metadata' }, + urls: { type: 'json', description: 'Discovered sitemap URLs' }, + meta: { type: 'json', description: 'Sitemap discovery stats' }, + query: { type: 'string', description: 'The query that was searched' }, + status: { type: 'string', description: 'Operation status' }, + urlsAnalyzed: { type: 'json', description: 'URLs analyzed during extraction' }, + data: { type: 'json', description: 'Structured data extracted from the site' }, + codes: { type: 'json', description: 'Matched industry classification codes' }, + classification: { type: 'string', description: 'SIC taxonomy version used' }, + brand: { type: 'json', description: 'Brand data (logos, colors, socials, industry)' }, + creditsConsumed: { type: 'number', description: 'Credits consumed by this request' }, + creditsRemaining: { type: 
'number', description: 'Credits remaining on the API key' }, + }, +} + +export const ContextDevBlockMeta = { + tags: ['web-scraping', 'enrichment', 'automation'], + url: 'https://www.context.dev', + templates: [ + { + icon: ContextDevIcon, + title: 'Context.dev knowledge-base builder', + prompt: + 'Build a workflow that maps a documentation site with Context.dev, crawls each page to clean markdown, chunks and embeds the content, and upserts it into a knowledge base for an answering agent.', + modules: ['knowledge-base', 'agent', 'workflows'], + category: 'engineering', + tags: ['research', 'sync'], + }, + { + icon: ContextDevIcon, + title: 'Context.dev competitor monitor', + prompt: + 'Build a scheduled workflow that scrapes competitor pricing and changelog pages to markdown with Context.dev weekly, diffs against the prior snapshot, logs changes to a table, and posts notable updates to Slack.', + modules: ['scheduled', 'tables', 'agent', 'workflows'], + category: 'marketing', + tags: ['marketing', 'monitoring'], + alsoIntegrations: ['slack'], + }, + { + icon: ContextDevIcon, + title: 'Context.dev lead enrichment', + prompt: + 'Create a workflow that takes a work email or domain, uses Context.dev to retrieve brand data and classify the company into NAICS codes, and writes the enriched firmographics to a CRM record.', + modules: ['agent', 'tables', 'workflows'], + category: 'sales', + tags: ['enrichment', 'sales'], + }, + { + icon: ContextDevIcon, + title: 'Context.dev structured data extractor', + prompt: + 'Build a workflow that takes a website URL and a JSON schema, uses Context.dev Extract to pull structured fields across the site, and returns the validated records as JSON.', + modules: ['agent', 'workflows'], + category: 'operations', + tags: ['automation', 'research'], + }, + { + icon: ContextDevIcon, + title: 'Context.dev research brief', + prompt: + 'Create an agent that runs a Context.dev web search on a topic, scrapes the top results to markdown, and 
synthesizes a cited research brief saved as a file.', + modules: ['agent', 'files', 'workflows'], + category: 'productivity', + tags: ['research'], + }, + { + icon: ContextDevIcon, + title: 'Context.dev brand asset fetcher', + prompt: + 'Build a workflow that takes a domain, uses Context.dev to retrieve the brand logos, colors, and a homepage screenshot, and stores the assets as files for a design handoff.', + modules: ['agent', 'files', 'workflows'], + category: 'marketing', + tags: ['marketing', 'enrichment'], + }, + { + icon: ContextDevIcon, + title: 'Context.dev transaction enrichment', + prompt: + 'Create a workflow that classifies a list of company domains into SIC and NAICS industry codes with Context.dev and appends the codes to a table for downstream reporting.', + modules: ['tables', 'agent', 'workflows'], + category: 'operations', + tags: ['enrichment', 'automation'], + }, + { + icon: ContextDevIcon, + title: 'Context.dev site change watcher', + prompt: + 'Build a scheduled workflow that maps a site sitemap with Context.dev, scrapes new or changed pages to markdown, summarizes the differences, and emails a digest.', + modules: ['scheduled', 'agent', 'workflows'], + category: 'operations', + tags: ['monitoring', 'automation'], + alsoIntegrations: ['gmail'], + }, + ], +} as const satisfies BlockMeta diff --git a/apps/sim/blocks/registry.ts b/apps/sim/blocks/registry.ts index 2cacc4576c6..88f03250471 100644 --- a/apps/sim/blocks/registry.ts +++ b/apps/sim/blocks/registry.ts @@ -38,6 +38,7 @@ import { CloudWatchBlock, CloudWatchBlockMeta } from '@/blocks/blocks/cloudwatch import { CodePipelineBlock, CodePipelineBlockMeta } from '@/blocks/blocks/codepipeline' import { ConditionBlock } from '@/blocks/blocks/condition' import { ConfluenceBlock, ConfluenceBlockMeta, ConfluenceV2Block } from '@/blocks/blocks/confluence' +import { ContextDevBlock, ContextDevBlockMeta } from '@/blocks/blocks/context_dev' import { ConvexBlock, ConvexBlockMeta } from 
'@/blocks/blocks/convex' import { CredentialBlock } from '@/blocks/blocks/credential' import { CrowdStrikeBlock, CrowdStrikeBlockMeta } from '@/blocks/blocks/crowdstrike' @@ -368,6 +369,7 @@ const BLOCK_REGISTRY: Record = { condition: ConditionBlock, confluence: ConfluenceBlock, confluence_v2: ConfluenceV2Block, + context_dev: ContextDevBlock, convex: ConvexBlock, credential: CredentialBlock, crowdstrike: CrowdStrikeBlock, @@ -667,6 +669,7 @@ const BLOCK_META_REGISTRY: Record = { cloudwatch: CloudWatchBlockMeta, codepipeline: CodePipelineBlockMeta, confluence: ConfluenceBlockMeta, + context_dev: ContextDevBlockMeta, convex: ConvexBlockMeta, crowdstrike: CrowdStrikeBlockMeta, cursor: CursorBlockMeta, diff --git a/apps/sim/components/icons.tsx b/apps/sim/components/icons.tsx index 8ea1529b1e8..dc3c91c5bef 100644 --- a/apps/sim/components/icons.tsx +++ b/apps/sim/components/icons.tsx @@ -2053,6 +2053,17 @@ export function ConfluenceIcon(props: SVGProps) { ) } +export function ContextDevIcon(props: SVGProps) { + return ( + + + + ) +} + export function ConvexIcon(props: SVGProps) { return ( = { cloudwatch: CloudWatchIcon, codepipeline: CodePipelineIcon, confluence_v2: ConfluenceIcon, + context_dev: ContextDevIcon, convex: ConvexIcon, crowdstrike: CrowdStrikeIcon, cursor_v2: CursorIcon, diff --git a/apps/sim/lib/integrations/integrations.json b/apps/sim/lib/integrations/integrations.json index c846bcdaf35..b66f26b25b9 100644 --- a/apps/sim/lib/integrations/integrations.json +++ b/apps/sim/lib/integrations/integrations.json @@ -1,5 +1,5 @@ { - "updatedAt": "2026-06-14", + "updatedAt": "2026-06-15", "integrations": [ { "type": "onepassword", @@ -3336,6 +3336,65 @@ "integrationType": "documents", "tags": ["knowledge-base", "content-management", "note-taking"] }, + { + "type": "context_dev", + "slug": "context-dev", + "name": "Context.dev", + "description": "Scrape, crawl, search, extract, and enrich web and brand data", + "longDescription": "Integrate Context.dev into the 
workflow. Scrape pages to markdown or HTML, capture screenshots, crawl entire sites, map sitemaps, search the web, extract structured data, classify industries, and retrieve brand assets — all from one API.", + "bgColor": "#ffffff", + "iconName": "ContextDevIcon", + "docsUrl": "https://docs.sim.ai/integrations/context_dev", + "operations": [ + { + "name": "Scrape Markdown", + "description": "Scrape any URL and return clean, LLM-ready markdown content." + }, + { + "name": "Scrape HTML", + "description": "Scrape any URL and return the raw HTML content of the page." + }, + { + "name": "Screenshot", + "description": "Capture a screenshot of any web page and store it as a downloadable image file." + }, + { + "name": "Crawl Website", + "description": "Crawl an entire website and return each discovered page as clean markdown." + }, + { + "name": "Map Sitemap", + "description": "Build a sitemap of a domain and return every discovered page URL." + }, + { + "name": "Web Search", + "description": "Search the web with natural language and optionally scrape results to markdown." + }, + { + "name": "Extract Structured Data", + "description": "Crawl a website and extract structured data matching a provided JSON schema." + }, + { + "name": "Classify NAICS", + "description": "Classify a brand into NAICS industry codes from its domain or company name." + }, + { + "name": "Classify SIC", + "description": "Classify a brand into SIC industry codes from its domain or company name." + }, + { + "name": "Get Brand Data", + "description": "Retrieve brand data for a domain: logos, colors, backdrops, socials, address, and industry." 
+ } + ], + "operationCount": 10, + "triggers": [], + "triggerCount": 0, + "authType": "api-key", + "category": "tools", + "integrationType": "search", + "tags": ["web-scraping", "enrichment", "automation"] + }, { "type": "convex", "slug": "convex", diff --git a/apps/sim/tools/context_dev/classify_naics.ts b/apps/sim/tools/context_dev/classify_naics.ts new file mode 100644 index 00000000000..1386b672487 --- /dev/null +++ b/apps/sim/tools/context_dev/classify_naics.ts @@ -0,0 +1,96 @@ +import type { + ContextDevClassifyNaicsParams, + ContextDevClassifyNaicsResponse, +} from '@/tools/context_dev/types' +import { CLASSIFICATION_CODE_OUTPUT_PROPERTIES } from '@/tools/context_dev/types' +import { + appendParam, + CONTEXT_DEV_BASE_URL, + CREDIT_OUTPUTS, + contextDevHeaders, + extractCreditMetadata, + parseContextDevResponse, +} from '@/tools/context_dev/utils' +import type { ToolConfig } from '@/tools/types' + +export const contextDevClassifyNaicsTool: ToolConfig< + ContextDevClassifyNaicsParams, + ContextDevClassifyNaicsResponse +> = { + id: 'context_dev_classify_naics', + name: 'Context.dev Classify NAICS', + description: 'Classify a brand into NAICS industry codes from its domain or company name.', + version: '1.0.0', + + params: { + input: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: 'Brand domain or company name to classify (e.g., "stripe.com" or "Stripe")', + }, + minResults: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Minimum number of codes to return (1-10, default: 1)', + }, + maxResults: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Maximum number of codes to return (1-10, default: 5)', + }, + timeoutMS: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Request timeout in milliseconds (1000-300000)', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'Context.dev API key', 
+ }, + }, + + request: { + method: 'GET', + url: (params) => { + const url = new URL(`${CONTEXT_DEV_BASE_URL}/web/naics`) + appendParam(url.searchParams, 'input', params.input) + appendParam(url.searchParams, 'minResults', params.minResults) + appendParam(url.searchParams, 'maxResults', params.maxResults) + appendParam(url.searchParams, 'timeoutMS', params.timeoutMS) + return url.toString() + }, + headers: (params) => contextDevHeaders(params.apiKey), + }, + + transformResponse: async (response: Response) => { + const data = await parseContextDevResponse(response) + return { + success: true, + output: { + status: data.status ?? '', + domain: data.domain ?? null, + type: data.type ?? null, + codes: data.codes ?? [], + ...extractCreditMetadata(data.key_metadata), + }, + } + }, + + outputs: { + status: { type: 'string', description: 'Classification status' }, + domain: { type: 'string', description: 'Resolved domain', optional: true }, + type: { type: 'string', description: 'Input type that was resolved', optional: true }, + codes: { + type: 'array', + description: 'Matched NAICS codes with name and confidence', + items: { type: 'object', properties: CLASSIFICATION_CODE_OUTPUT_PROPERTIES }, + }, + ...CREDIT_OUTPUTS, + }, +} diff --git a/apps/sim/tools/context_dev/classify_sic.ts b/apps/sim/tools/context_dev/classify_sic.ts new file mode 100644 index 00000000000..a45b9ebb5d6 --- /dev/null +++ b/apps/sim/tools/context_dev/classify_sic.ts @@ -0,0 +1,117 @@ +import type { + ContextDevClassifySicParams, + ContextDevClassifySicResponse, +} from '@/tools/context_dev/types' +import { CLASSIFICATION_CODE_OUTPUT_PROPERTIES } from '@/tools/context_dev/types' +import { + appendParam, + CONTEXT_DEV_BASE_URL, + CREDIT_OUTPUTS, + contextDevHeaders, + extractCreditMetadata, + parseContextDevResponse, +} from '@/tools/context_dev/utils' +import type { ToolConfig } from '@/tools/types' + +export const contextDevClassifySicTool: ToolConfig< + ContextDevClassifySicParams, + 
ContextDevClassifySicResponse +> = { + id: 'context_dev_classify_sic', + name: 'Context.dev Classify SIC', + description: 'Classify a brand into SIC industry codes from its domain or company name.', + version: '1.0.0', + + params: { + input: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: 'Brand domain or company name to classify (e.g., "stripe.com" or "Stripe")', + }, + type: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: 'SIC taxonomy version: "original_sic" (default) or "latest_sec"', + }, + minResults: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Minimum number of codes to return (1-10, default: 1)', + }, + maxResults: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Maximum number of codes to return (1-10, default: 5)', + }, + timeoutMS: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Request timeout in milliseconds (1000-300000)', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'Context.dev API key', + }, + }, + + request: { + method: 'GET', + url: (params) => { + const url = new URL(`${CONTEXT_DEV_BASE_URL}/web/sic`) + appendParam(url.searchParams, 'input', params.input) + appendParam(url.searchParams, 'type', params.type) + appendParam(url.searchParams, 'minResults', params.minResults) + appendParam(url.searchParams, 'maxResults', params.maxResults) + appendParam(url.searchParams, 'timeoutMS', params.timeoutMS) + return url.toString() + }, + headers: (params) => contextDevHeaders(params.apiKey), + }, + + transformResponse: async (response: Response) => { + const data = await parseContextDevResponse(response) + return { + success: true, + output: { + status: data.status ?? '', + domain: data.domain ?? null, + type: data.type ?? null, + classification: data.classification ?? null, + codes: data.codes ?? 
[], + ...extractCreditMetadata(data.key_metadata), + }, + } + }, + + outputs: { + status: { type: 'string', description: 'Classification status' }, + domain: { type: 'string', description: 'Resolved domain', optional: true }, + type: { type: 'string', description: 'Input type that was resolved', optional: true }, + classification: { + type: 'string', + description: 'SIC taxonomy version used (original_sic or latest_sec)', + optional: true, + }, + codes: { + type: 'array', + description: 'Matched SIC codes with name, confidence, and group metadata', + items: { + type: 'object', + properties: { + ...CLASSIFICATION_CODE_OUTPUT_PROPERTIES, + majorGroup: { type: 'string', description: 'Major group code (original_sic only)' }, + majorGroupName: { type: 'string', description: 'Major group name (original_sic only)' }, + office: { type: 'string', description: 'SEC office (latest_sec only)' }, + }, + }, + }, + ...CREDIT_OUTPUTS, + }, +} diff --git a/apps/sim/tools/context_dev/crawl.ts b/apps/sim/tools/context_dev/crawl.ts new file mode 100644 index 00000000000..6184899363b --- /dev/null +++ b/apps/sim/tools/context_dev/crawl.ts @@ -0,0 +1,144 @@ +import type { ContextDevCrawlParams, ContextDevCrawlResponse } from '@/tools/context_dev/types' +import { CRAWL_RESULT_OUTPUT_PROPERTIES } from '@/tools/context_dev/types' +import { + CONTEXT_DEV_BASE_URL, + CREDIT_OUTPUTS, + contextDevJsonHeaders, + extractCreditMetadata, + parseContextDevResponse, +} from '@/tools/context_dev/utils' +import type { ToolConfig } from '@/tools/types' + +export const contextDevCrawlTool: ToolConfig = { + id: 'context_dev_crawl', + name: 'Context.dev Crawl', + description: 'Crawl an entire website and return each discovered page as clean markdown.', + version: '1.0.0', + + params: { + url: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: 'The starting URL to crawl (must include http:// or https://)', + }, + maxPages: { + type: 'number', + required: false, + visibility: 
'user-or-llm', + description: 'Maximum number of pages to crawl (1-500, default: 100)', + }, + maxDepth: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Maximum link depth from the starting URL (0 = start page only)', + }, + urlRegex: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: 'Regex pattern to filter which URLs are crawled', + }, + includeLinks: { + type: 'boolean', + required: false, + visibility: 'user-or-llm', + description: 'Preserve hyperlinks in the markdown output (default: true)', + }, + includeImages: { + type: 'boolean', + required: false, + visibility: 'user-or-llm', + description: 'Include image references in the markdown output (default: false)', + }, + useMainContentOnly: { + type: 'boolean', + required: false, + visibility: 'user-or-llm', + description: 'Strip headers, footers, and sidebars from each page (default: false)', + }, + followSubdomains: { + type: 'boolean', + required: false, + visibility: 'user-or-llm', + description: 'Follow links to subdomains of the starting domain (default: false)', + }, + maxAgeMs: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Cache duration in milliseconds (0-2592000000, default: 86400000)', + }, + waitForMs: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Browser wait time after page load in milliseconds (0-30000)', + }, + stopAfterMs: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Soft crawl time budget in milliseconds (10000-110000, default: 80000)', + }, + timeoutMS: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Request timeout in milliseconds (1000-300000)', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'Context.dev API key', + }, + }, + + request: { + method: 'POST', + url: () => `${CONTEXT_DEV_BASE_URL}/web/crawl`, + headers: (params) => 
contextDevJsonHeaders(params.apiKey), + body: (params) => { + const body: Record = { url: params.url } + if (params.maxPages != null) body.maxPages = params.maxPages + if (params.maxDepth != null) body.maxDepth = params.maxDepth + if (params.urlRegex) body.urlRegex = params.urlRegex + if (params.includeLinks != null) body.includeLinks = params.includeLinks + if (params.includeImages != null) body.includeImages = params.includeImages + if (params.useMainContentOnly != null) body.useMainContentOnly = params.useMainContentOnly + if (params.followSubdomains != null) body.followSubdomains = params.followSubdomains + if (params.maxAgeMs != null) body.maxAgeMs = params.maxAgeMs + if (params.waitForMs != null) body.waitForMs = params.waitForMs + if (params.stopAfterMs != null) body.stopAfterMs = params.stopAfterMs + if (params.timeoutMS != null) body.timeoutMS = params.timeoutMS + return body + }, + }, + + transformResponse: async (response: Response) => { + const data = await parseContextDevResponse(response) + return { + success: true, + output: { + results: data.results ?? [], + metadata: data.metadata ?? 
{}, + ...extractCreditMetadata(data.key_metadata), + }, + } + }, + + outputs: { + results: { + type: 'array', + description: 'Crawled pages with markdown content and per-page metadata', + items: { type: 'object', properties: CRAWL_RESULT_OUTPUT_PROPERTIES }, + }, + metadata: { + type: 'object', + description: 'Crawl summary (numUrls, maxCrawlDepth, numSucceeded, numFailed, numSkipped)', + }, + ...CREDIT_OUTPUTS, + }, +} diff --git a/apps/sim/tools/context_dev/extract.ts b/apps/sim/tools/context_dev/extract.ts new file mode 100644 index 00000000000..21cefdb1e5d --- /dev/null +++ b/apps/sim/tools/context_dev/extract.ts @@ -0,0 +1,135 @@ +import type { ContextDevExtractParams, ContextDevExtractResponse } from '@/tools/context_dev/types' +import { + CONTEXT_DEV_BASE_URL, + CREDIT_OUTPUTS, + contextDevJsonHeaders, + extractCreditMetadata, + parseContextDevResponse, +} from '@/tools/context_dev/utils' +import type { ToolConfig } from '@/tools/types' + +export const contextDevExtractTool: ToolConfig = + { + id: 'context_dev_extract', + name: 'Context.dev Extract', + description: 'Crawl a website and extract structured data matching a provided JSON schema.', + version: '1.0.0', + + params: { + url: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: 'The starting website URL (must include http:// or https://)', + }, + schema: { + type: 'json', + required: true, + visibility: 'user-or-llm', + description: 'JSON Schema describing the structure of the data to extract', + }, + instructions: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: 'Optional extraction guidance for link prioritization (max 2000 chars)', + }, + factCheck: { + type: 'boolean', + required: false, + visibility: 'user-or-llm', + description: 'Require extracted values to be grounded in page facts (default: false)', + }, + followSubdomains: { + type: 'boolean', + required: false, + visibility: 'user-or-llm', + description: 'Follow links on 
subdomains of the starting domain (default: false)', + }, + maxPages: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Maximum number of pages to analyze (1-50, default: 5)', + }, + maxDepth: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Maximum link depth from the starting URL', + }, + maxAgeMs: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Cache duration in milliseconds (0-2592000000, default: 604800000)', + }, + stopAfterMs: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Soft crawl time budget in milliseconds (10000-110000, default: 80000)', + }, + timeoutMS: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Request timeout in milliseconds (1000-300000)', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'Context.dev API key', + }, + }, + + request: { + method: 'POST', + url: () => `${CONTEXT_DEV_BASE_URL}/web/extract`, + headers: (params) => contextDevJsonHeaders(params.apiKey), + body: (params) => { + const body: Record = { url: params.url, schema: params.schema } + if (params.instructions) body.instructions = params.instructions + if (params.factCheck != null) body.factCheck = params.factCheck + if (params.followSubdomains != null) body.followSubdomains = params.followSubdomains + if (params.maxPages != null) body.maxPages = params.maxPages + if (params.maxDepth != null) body.maxDepth = params.maxDepth + if (params.maxAgeMs != null) body.maxAgeMs = params.maxAgeMs + if (params.stopAfterMs != null) body.stopAfterMs = params.stopAfterMs + if (params.timeoutMS != null) body.timeoutMS = params.timeoutMS + return body + }, + }, + + transformResponse: async (response: Response) => { + const data = await parseContextDevResponse(response) + return { + success: true, + output: { + status: data.status ?? '', + url: data.url ?? 
'', + urlsAnalyzed: data.urls_analyzed ?? [], + data: data.data ?? {}, + metadata: data.metadata ?? {}, + ...extractCreditMetadata(data.key_metadata), + }, + } + }, + + outputs: { + status: { type: 'string', description: 'Extraction status' }, + url: { type: 'string', description: 'The starting URL that was crawled' }, + urlsAnalyzed: { + type: 'array', + description: 'URLs that were analyzed during extraction', + items: { type: 'string', description: 'Analyzed page URL' }, + }, + data: { type: 'json', description: 'Structured data matching the requested schema' }, + metadata: { + type: 'object', + description: 'Crawl summary (numUrls, maxCrawlDepth, numSucceeded, numFailed, numSkipped)', + }, + ...CREDIT_OUTPUTS, + }, + } diff --git a/apps/sim/tools/context_dev/get_brand.ts b/apps/sim/tools/context_dev/get_brand.ts new file mode 100644 index 00000000000..26a66596511 --- /dev/null +++ b/apps/sim/tools/context_dev/get_brand.ts @@ -0,0 +1,116 @@ +import type { + ContextDevGetBrandParams, + ContextDevGetBrandResponse, +} from '@/tools/context_dev/types' +import { + appendParam, + CONTEXT_DEV_BASE_URL, + CREDIT_OUTPUTS, + contextDevHeaders, + extractCreditMetadata, + parseContextDevResponse, +} from '@/tools/context_dev/utils' +import type { ToolConfig } from '@/tools/types' + +export const contextDevGetBrandTool: ToolConfig< + ContextDevGetBrandParams, + ContextDevGetBrandResponse +> = { + id: 'context_dev_get_brand', + name: 'Context.dev Get Brand', + description: + 'Retrieve brand data for a domain: logos, colors, backdrops, socials, address, and industry.', + version: '1.0.0', + + params: { + domain: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: 'The domain to retrieve brand data for (e.g., "airbnb.com")', + }, + forceLanguage: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: 'Override the detected language with a supported language code', + }, + maxSpeed: { + type: 'boolean', + required: false, 
+ visibility: 'user-or-llm', + description: 'Skip time-consuming operations for a faster response (default: false)', + }, + maxAgeMs: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Cache max age in milliseconds (86400000-31536000000, default: 7776000000)', + }, + timeoutMS: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Request timeout in milliseconds (1000-300000)', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'Context.dev API key', + }, + }, + + request: { + method: 'GET', + url: (params) => { + const url = new URL(`${CONTEXT_DEV_BASE_URL}/brand/retrieve`) + appendParam(url.searchParams, 'domain', params.domain) + appendParam(url.searchParams, 'force_language', params.forceLanguage) + appendParam(url.searchParams, 'maxSpeed', params.maxSpeed) + appendParam(url.searchParams, 'maxAgeMs', params.maxAgeMs) + appendParam(url.searchParams, 'timeoutMS', params.timeoutMS) + return url.toString() + }, + headers: (params) => contextDevHeaders(params.apiKey), + }, + + transformResponse: async (response: Response) => { + const data = await parseContextDevResponse(response) + return { + success: true, + output: { + status: data.status ?? '', + brand: data.brand ?? 
null, + ...extractCreditMetadata(data.key_metadata), + }, + } + }, + + outputs: { + status: { type: 'string', description: 'Retrieval status' }, + brand: { + type: 'object', + description: 'Brand data object', + properties: { + domain: { type: 'string', description: 'Brand domain' }, + title: { type: 'string', description: 'Brand title' }, + description: { type: 'string', description: 'Brand description' }, + slogan: { type: 'string', description: 'Brand slogan' }, + colors: { type: 'json', description: 'Brand colors (hex and name)' }, + logos: { type: 'json', description: 'Brand logos with mode, colors, resolution, and type' }, + backdrops: { type: 'json', description: 'Brand backdrop images' }, + socials: { type: 'json', description: 'Social media profiles (type and url)' }, + address: { type: 'json', description: 'Brand address' }, + stock: { type: 'json', description: 'Stock info (ticker and exchange)' }, + is_nsfw: { type: 'boolean', description: 'Whether the brand contains adult content' }, + email: { type: 'string', description: 'Brand contact email' }, + phone: { type: 'string', description: 'Brand contact phone' }, + industries: { type: 'json', description: 'Industry taxonomy (eic pairs)' }, + links: { type: 'json', description: 'Key brand links (careers, privacy, terms, etc.)' }, + primary_language: { type: 'string', description: 'Primary language of the brand site' }, + }, + }, + ...CREDIT_OUTPUTS, + }, +} diff --git a/apps/sim/tools/context_dev/index.ts b/apps/sim/tools/context_dev/index.ts new file mode 100644 index 00000000000..1fb08c8881d --- /dev/null +++ b/apps/sim/tools/context_dev/index.ts @@ -0,0 +1,10 @@ +export { contextDevClassifyNaicsTool } from '@/tools/context_dev/classify_naics' +export { contextDevClassifySicTool } from '@/tools/context_dev/classify_sic' +export { contextDevCrawlTool } from '@/tools/context_dev/crawl' +export { contextDevExtractTool } from '@/tools/context_dev/extract' +export { contextDevGetBrandTool } from 
'@/tools/context_dev/get_brand' +export { contextDevMapTool } from '@/tools/context_dev/map' +export { contextDevScrapeHtmlTool } from '@/tools/context_dev/scrape_html' +export { contextDevScrapeMarkdownTool } from '@/tools/context_dev/scrape_markdown' +export { contextDevScreenshotTool } from '@/tools/context_dev/screenshot' +export { contextDevSearchTool } from '@/tools/context_dev/search' diff --git a/apps/sim/tools/context_dev/map.ts b/apps/sim/tools/context_dev/map.ts new file mode 100644 index 00000000000..c8491b02d5e --- /dev/null +++ b/apps/sim/tools/context_dev/map.ts @@ -0,0 +1,90 @@ +import type { ContextDevMapParams, ContextDevMapResponse } from '@/tools/context_dev/types' +import { + appendParam, + CONTEXT_DEV_BASE_URL, + CREDIT_OUTPUTS, + contextDevHeaders, + extractCreditMetadata, + parseContextDevResponse, +} from '@/tools/context_dev/utils' +import type { ToolConfig } from '@/tools/types' + +export const contextDevMapTool: ToolConfig = { + id: 'context_dev_map', + name: 'Context.dev Map', + description: 'Build a sitemap of a domain and return every discovered page URL.', + version: '1.0.0', + + params: { + domain: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: 'The domain to build a sitemap for (e.g., "example.com")', + }, + maxLinks: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Maximum number of URLs to return (1-100000, default: 10000)', + }, + urlRegex: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: 'RE2-compatible regex to filter URLs (max 256 chars)', + }, + timeoutMS: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Request timeout in milliseconds (1000-300000)', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'Context.dev API key', + }, + }, + + request: { + method: 'GET', + url: (params) => { + const url = new 
URL(`${CONTEXT_DEV_BASE_URL}/web/scrape/sitemap`) + appendParam(url.searchParams, 'domain', params.domain) + appendParam(url.searchParams, 'maxLinks', params.maxLinks) + appendParam(url.searchParams, 'urlRegex', params.urlRegex) + appendParam(url.searchParams, 'timeoutMS', params.timeoutMS) + return url.toString() + }, + headers: (params) => contextDevHeaders(params.apiKey), + }, + + transformResponse: async (response: Response) => { + const data = await parseContextDevResponse(response) + return { + success: true, + output: { + domain: data.domain ?? '', + urls: data.urls ?? [], + meta: data.meta ?? {}, + ...extractCreditMetadata(data.key_metadata), + }, + } + }, + + outputs: { + domain: { type: 'string', description: 'The domain that was mapped' }, + urls: { + type: 'array', + description: 'All page URLs discovered from the sitemap', + items: { type: 'string', description: 'Page URL' }, + }, + meta: { + type: 'object', + description: 'Sitemap discovery stats (sitemapsDiscovered, sitemapsFetched, errors)', + }, + ...CREDIT_OUTPUTS, + }, +} diff --git a/apps/sim/tools/context_dev/scrape_html.ts b/apps/sim/tools/context_dev/scrape_html.ts new file mode 100644 index 00000000000..b4cf5d42e2f --- /dev/null +++ b/apps/sim/tools/context_dev/scrape_html.ts @@ -0,0 +1,106 @@ +import type { + ContextDevScrapeHtmlParams, + ContextDevScrapeHtmlResponse, +} from '@/tools/context_dev/types' +import { + appendParam, + CONTEXT_DEV_BASE_URL, + CREDIT_OUTPUTS, + contextDevHeaders, + extractCreditMetadata, + parseContextDevResponse, +} from '@/tools/context_dev/utils' +import type { ToolConfig } from '@/tools/types' + +export const contextDevScrapeHtmlTool: ToolConfig< + ContextDevScrapeHtmlParams, + ContextDevScrapeHtmlResponse +> = { + id: 'context_dev_scrape_html', + name: 'Context.dev Scrape HTML', + description: 'Scrape any URL and return the raw HTML content of the page.', + version: '1.0.0', + + params: { + url: { + type: 'string', + required: true, + visibility: 
'user-or-llm', + description: 'The full URL to scrape (must include http:// or https://)', + }, + useMainContentOnly: { + type: 'boolean', + required: false, + visibility: 'user-or-llm', + description: 'Return only main content, excluding headers, footers, and navigation', + }, + includeFrames: { + type: 'boolean', + required: false, + visibility: 'user-or-llm', + description: 'Render iframe contents inline into the returned HTML (default: false)', + }, + maxAgeMs: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Cache duration in milliseconds (0-2592000000, default: 86400000)', + }, + waitForMs: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Browser wait time after page load in milliseconds (0-30000)', + }, + timeoutMS: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Request timeout in milliseconds (1000-300000)', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'Context.dev API key', + }, + }, + + request: { + method: 'GET', + url: (params) => { + const url = new URL(`${CONTEXT_DEV_BASE_URL}/web/scrape/html`) + appendParam(url.searchParams, 'url', params.url) + appendParam(url.searchParams, 'useMainContentOnly', params.useMainContentOnly) + appendParam(url.searchParams, 'includeFrames', params.includeFrames) + appendParam(url.searchParams, 'maxAgeMs', params.maxAgeMs) + appendParam(url.searchParams, 'waitForMs', params.waitForMs) + appendParam(url.searchParams, 'timeoutMS', params.timeoutMS) + return url.toString() + }, + headers: (params) => contextDevHeaders(params.apiKey), + }, + + transformResponse: async (response: Response) => { + const data = await parseContextDevResponse(response) + return { + success: true, + output: { + html: data.html ?? '', + url: data.url ?? '', + type: data.type ?? 
'html', + ...extractCreditMetadata(data.key_metadata), + }, + } + }, + + outputs: { + html: { type: 'string', description: 'Raw HTML content of the page' }, + url: { type: 'string', description: 'The scraped URL' }, + type: { + type: 'string', + description: 'Detected content type (html, xml, json, text, csv, markdown, svg, pdf)', + }, + ...CREDIT_OUTPUTS, + }, +} diff --git a/apps/sim/tools/context_dev/scrape_markdown.ts b/apps/sim/tools/context_dev/scrape_markdown.ts new file mode 100644 index 00000000000..8faed1c8a57 --- /dev/null +++ b/apps/sim/tools/context_dev/scrape_markdown.ts @@ -0,0 +1,115 @@ +import type { + ContextDevScrapeMarkdownParams, + ContextDevScrapeMarkdownResponse, +} from '@/tools/context_dev/types' +import { + appendParam, + CONTEXT_DEV_BASE_URL, + CREDIT_OUTPUTS, + contextDevHeaders, + extractCreditMetadata, + parseContextDevResponse, +} from '@/tools/context_dev/utils' +import type { ToolConfig } from '@/tools/types' + +export const contextDevScrapeMarkdownTool: ToolConfig< + ContextDevScrapeMarkdownParams, + ContextDevScrapeMarkdownResponse +> = { + id: 'context_dev_scrape_markdown', + name: 'Context.dev Scrape Markdown', + description: 'Scrape any URL and return clean, LLM-ready markdown content.', + version: '1.0.0', + + params: { + url: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: 'The full URL to scrape (must include http:// or https://)', + }, + useMainContentOnly: { + type: 'boolean', + required: false, + visibility: 'user-or-llm', + description: 'Return only main content, excluding headers, footers, and navigation', + }, + includeLinks: { + type: 'boolean', + required: false, + visibility: 'user-or-llm', + description: 'Preserve hyperlinks in the markdown output (default: true)', + }, + includeImages: { + type: 'boolean', + required: false, + visibility: 'user-or-llm', + description: 'Include image references in the markdown output (default: false)', + }, + includeFrames: { + type: 'boolean', + 
required: false, + visibility: 'user-or-llm', + description: 'Render iframe contents inline (default: false)', + }, + maxAgeMs: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Cache duration in milliseconds (0-2592000000, default: 86400000)', + }, + waitForMs: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Browser wait time after page load in milliseconds (0-30000)', + }, + timeoutMS: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Request timeout in milliseconds (1000-300000)', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'Context.dev API key', + }, + }, + + request: { + method: 'GET', + url: (params) => { + const url = new URL(`${CONTEXT_DEV_BASE_URL}/web/scrape/markdown`) + appendParam(url.searchParams, 'url', params.url) + appendParam(url.searchParams, 'useMainContentOnly', params.useMainContentOnly) + appendParam(url.searchParams, 'includeLinks', params.includeLinks) + appendParam(url.searchParams, 'includeImages', params.includeImages) + appendParam(url.searchParams, 'includeFrames', params.includeFrames) + appendParam(url.searchParams, 'maxAgeMs', params.maxAgeMs) + appendParam(url.searchParams, 'waitForMs', params.waitForMs) + appendParam(url.searchParams, 'timeoutMS', params.timeoutMS) + return url.toString() + }, + headers: (params) => contextDevHeaders(params.apiKey), + }, + + transformResponse: async (response: Response) => { + const data = await parseContextDevResponse(response) + return { + success: true, + output: { + markdown: data.markdown ?? '', + url: data.url ?? 
'', + ...extractCreditMetadata(data.key_metadata), + }, + } + }, + + outputs: { + markdown: { type: 'string', description: 'Page content as clean markdown' }, + url: { type: 'string', description: 'The scraped URL' }, + ...CREDIT_OUTPUTS, + }, +} diff --git a/apps/sim/tools/context_dev/screenshot.ts b/apps/sim/tools/context_dev/screenshot.ts new file mode 100644 index 00000000000..58e92a1b565 --- /dev/null +++ b/apps/sim/tools/context_dev/screenshot.ts @@ -0,0 +1,138 @@ +import type { + ContextDevScreenshotParams, + ContextDevScreenshotResponse, +} from '@/tools/context_dev/types' +import { + appendParam, + CONTEXT_DEV_BASE_URL, + CREDIT_OUTPUTS, + contextDevHeaders, + extractCreditMetadata, + parseContextDevResponse, +} from '@/tools/context_dev/utils' +import type { ToolConfig, ToolFileData } from '@/tools/types' + +export const contextDevScreenshotTool: ToolConfig< + ContextDevScreenshotParams, + ContextDevScreenshotResponse +> = { + id: 'context_dev_screenshot', + name: 'Context.dev Screenshot', + description: 'Capture a screenshot of any web page and store it as a downloadable image file.', + version: '1.0.0', + + params: { + url: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: 'The full URL to capture (must include http:// or https://)', + }, + fullScreenshot: { + type: 'boolean', + required: false, + visibility: 'user-or-llm', + description: 'Capture the full scrollable page instead of just the viewport (default: false)', + }, + handleCookiePopup: { + type: 'boolean', + required: false, + visibility: 'user-or-llm', + description: 'Attempt to dismiss cookie banners before capturing (default: false)', + }, + viewportWidth: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Viewport width in pixels (240-7680, default: 1920)', + }, + viewportHeight: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Viewport height in pixels (240-4320, default: 1080)', + }, + 
maxAgeMs: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Cache duration in milliseconds (0-2592000000, default: 86400000)', + }, + waitForMs: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Post-load delay before capturing in milliseconds (0-30000, default: 3000)', + }, + timeoutMS: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Request timeout in milliseconds (1000-300000)', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'Context.dev API key', + }, + }, + + request: { + method: 'GET', + url: (params) => { + const url = new URL(`${CONTEXT_DEV_BASE_URL}/web/screenshot`) + appendParam(url.searchParams, 'directUrl', params.url) + appendParam(url.searchParams, 'fullScreenshot', params.fullScreenshot) + appendParam(url.searchParams, 'handleCookiePopup', params.handleCookiePopup) + appendParam(url.searchParams, 'viewport[width]', params.viewportWidth) + appendParam(url.searchParams, 'viewport[height]', params.viewportHeight) + appendParam(url.searchParams, 'maxAgeMs', params.maxAgeMs) + appendParam(url.searchParams, 'waitForMs', params.waitForMs) + appendParam(url.searchParams, 'timeoutMS', params.timeoutMS) + return url.toString() + }, + headers: (params) => contextDevHeaders(params.apiKey), + }, + + transformResponse: async (response: Response) => { + const data = await parseContextDevResponse(response) + const screenshotUrl: string = data.screenshot ?? '' + const domain: string | null = data.domain ?? null + + const file: ToolFileData | undefined = screenshotUrl + ? { + name: `${domain ?? 'screenshot'}.png`, + mimeType: 'image/png', + url: screenshotUrl, + } + : undefined + + return { + success: true, + output: { + ...(file ? { file } : {}), + screenshotUrl, + screenshotType: data.screenshotType ?? null, + domain, + width: data.width ?? null, + height: data.height ?? 
null, + ...extractCreditMetadata(data.key_metadata), + }, + } + }, + + outputs: { + file: { type: 'file', description: 'Stored screenshot image file', optional: true }, + screenshotUrl: { type: 'string', description: 'Public URL of the captured screenshot' }, + screenshotType: { + type: 'string', + description: 'Screenshot type (viewport or fullPage)', + optional: true, + }, + domain: { type: 'string', description: 'Domain that was captured', optional: true }, + width: { type: 'number', description: 'Screenshot width in pixels', optional: true }, + height: { type: 'number', description: 'Screenshot height in pixels', optional: true }, + ...CREDIT_OUTPUTS, + }, +} diff --git a/apps/sim/tools/context_dev/search.ts b/apps/sim/tools/context_dev/search.ts new file mode 100644 index 00000000000..2fd150d0b41 --- /dev/null +++ b/apps/sim/tools/context_dev/search.ts @@ -0,0 +1,108 @@ +import type { ContextDevSearchParams, ContextDevSearchResponse } from '@/tools/context_dev/types' +import { SEARCH_RESULT_OUTPUT_PROPERTIES } from '@/tools/context_dev/types' +import { + CONTEXT_DEV_BASE_URL, + CREDIT_OUTPUTS, + contextDevJsonHeaders, + extractCreditMetadata, + parseContextDevResponse, +} from '@/tools/context_dev/utils' +import type { ToolConfig } from '@/tools/types' + +export const contextDevSearchTool: ToolConfig = { + id: 'context_dev_search', + name: 'Context.dev Search', + description: 'Search the web with natural language and optionally scrape results to markdown.', + version: '1.0.0', + + params: { + query: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: 'The natural language search query (1-500 characters)', + }, + includeDomains: { + type: 'array', + required: false, + visibility: 'user-or-llm', + description: 'Only return results from these domains', + }, + excludeDomains: { + type: 'array', + required: false, + visibility: 'user-or-llm', + description: 'Exclude results from these domains', + }, + freshness: { + type: 'string', + 
required: false, + visibility: 'user-or-llm', + description: 'Recency filter (last_24_hours, last_week, last_month, last_year)', + }, + queryFanout: { + type: 'boolean', + required: false, + visibility: 'user-or-llm', + description: 'Expand the query into parallel variants for broader coverage', + }, + markdownEnabled: { + type: 'boolean', + required: false, + visibility: 'user-or-llm', + description: 'Scrape each result page to markdown (default: false)', + }, + timeoutMS: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Request timeout in milliseconds (1000-300000)', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'Context.dev API key', + }, + }, + + request: { + method: 'POST', + url: () => `${CONTEXT_DEV_BASE_URL}/web/search`, + headers: (params) => contextDevJsonHeaders(params.apiKey), + body: (params) => { + const body: Record = { query: params.query } + if (params.includeDomains?.length) body.includeDomains = params.includeDomains + if (params.excludeDomains?.length) body.excludeDomains = params.excludeDomains + if (params.freshness) body.freshness = params.freshness + if (params.queryFanout != null) body.queryFanout = params.queryFanout + if (params.markdownEnabled != null) { + body.markdownOptions = { enabled: params.markdownEnabled } + } + if (params.timeoutMS != null) body.timeoutMS = params.timeoutMS + return body + }, + }, + + transformResponse: async (response: Response) => { + const data = await parseContextDevResponse(response) + return { + success: true, + output: { + results: data.results ?? [], + query: data.query ?? 
'', + ...extractCreditMetadata(data.key_metadata), + }, + } + }, + + outputs: { + results: { + type: 'array', + description: 'Search results with url, title, description, relevance, and optional markdown', + items: { type: 'object', properties: SEARCH_RESULT_OUTPUT_PROPERTIES }, + }, + query: { type: 'string', description: 'The query that was searched' }, + ...CREDIT_OUTPUTS, + }, +} diff --git a/apps/sim/tools/context_dev/types.ts b/apps/sim/tools/context_dev/types.ts new file mode 100644 index 00000000000..4e7d1734318 --- /dev/null +++ b/apps/sim/tools/context_dev/types.ts @@ -0,0 +1,228 @@ +import type { ToolFileData, ToolResponse } from '@/tools/types' + +/** Credit accounting fields surfaced on every Context.dev tool output. */ +interface CreditFields { + creditsConsumed: number | null + creditsRemaining: number | null +} + +export interface ContextDevScrapeMarkdownParams { + apiKey: string + url: string + useMainContentOnly?: boolean + includeLinks?: boolean + includeImages?: boolean + includeFrames?: boolean + maxAgeMs?: number + waitForMs?: number + timeoutMS?: number +} + +export interface ContextDevScrapeMarkdownResponse extends ToolResponse { + output: CreditFields & { + markdown: string + url: string + } +} + +export interface ContextDevScrapeHtmlParams { + apiKey: string + url: string + useMainContentOnly?: boolean + includeFrames?: boolean + maxAgeMs?: number + waitForMs?: number + timeoutMS?: number +} + +export interface ContextDevScrapeHtmlResponse extends ToolResponse { + output: CreditFields & { + html: string + url: string + type: string + } +} + +export interface ContextDevScreenshotParams { + apiKey: string + url: string + fullScreenshot?: boolean + handleCookiePopup?: boolean + viewportWidth?: number + viewportHeight?: number + maxAgeMs?: number + waitForMs?: number + timeoutMS?: number +} + +export interface ContextDevScreenshotResponse extends ToolResponse { + output: CreditFields & { + file?: ToolFileData + screenshotUrl: string + 
screenshotType: string | null + domain: string | null + width: number | null + height: number | null + } +} + +export interface ContextDevCrawlParams { + apiKey: string + url: string + maxPages?: number + maxDepth?: number + urlRegex?: string + includeLinks?: boolean + includeImages?: boolean + useMainContentOnly?: boolean + followSubdomains?: boolean + maxAgeMs?: number + waitForMs?: number + stopAfterMs?: number + timeoutMS?: number +} + +export interface ContextDevCrawlResponse extends ToolResponse { + output: CreditFields & { + results: Array<{ + markdown: string + metadata: Record + }> + metadata: Record + } +} + +export interface ContextDevMapParams { + apiKey: string + domain: string + maxLinks?: number + urlRegex?: string + timeoutMS?: number +} + +export interface ContextDevMapResponse extends ToolResponse { + output: CreditFields & { + domain: string + urls: string[] + meta: Record + } +} + +export interface ContextDevSearchParams { + apiKey: string + query: string + includeDomains?: string[] + excludeDomains?: string[] + freshness?: string + queryFanout?: boolean + markdownEnabled?: boolean + timeoutMS?: number +} + +export interface ContextDevSearchResponse extends ToolResponse { + output: CreditFields & { + results: Array> + query: string + } +} + +export interface ContextDevExtractParams { + apiKey: string + url: string + schema: Record + instructions?: string + factCheck?: boolean + followSubdomains?: boolean + maxPages?: number + maxDepth?: number + maxAgeMs?: number + stopAfterMs?: number + timeoutMS?: number +} + +export interface ContextDevExtractResponse extends ToolResponse { + output: CreditFields & { + status: string + url: string + urlsAnalyzed: string[] + data: Record + metadata: Record + } +} + +export interface ContextDevClassifyNaicsParams { + apiKey: string + input: string + minResults?: number + maxResults?: number + timeoutMS?: number +} + +export interface ContextDevClassifyNaicsResponse extends ToolResponse { + output: CreditFields 
& { + status: string + domain: string | null + type: string | null + codes: Array> + } +} + +export interface ContextDevClassifySicParams { + apiKey: string + input: string + type?: string + minResults?: number + maxResults?: number + timeoutMS?: number +} + +export interface ContextDevClassifySicResponse extends ToolResponse { + output: CreditFields & { + status: string + domain: string | null + type: string | null + classification: string | null + codes: Array> + } +} + +export interface ContextDevGetBrandParams { + apiKey: string + domain: string + forceLanguage?: string + maxSpeed?: boolean + maxAgeMs?: number + timeoutMS?: number +} + +export interface ContextDevGetBrandResponse extends ToolResponse { + output: CreditFields & { + status: string + brand: Record | null + } +} + +/** Output schema for a single web search result. */ +export const SEARCH_RESULT_OUTPUT_PROPERTIES = { + url: { type: 'string', description: 'Result page URL' }, + title: { type: 'string', description: 'Result page title' }, + description: { type: 'string', description: 'Result snippet/description' }, + relevance: { type: 'string', description: 'Relevance rating (high, medium, low)' }, + markdown: { + type: 'json', + description: 'Scraped markdown for the result (when markdown scraping is enabled)', + }, +} as const + +/** Output schema for a single crawled page. */ +export const CRAWL_RESULT_OUTPUT_PROPERTIES = { + markdown: { type: 'string', description: 'Page content as markdown' }, + metadata: { type: 'json', description: 'Page metadata (url, title, crawlDepth, statusCode)' }, +} as const + +/** Output schema for a single industry classification code. 
*/ +export const CLASSIFICATION_CODE_OUTPUT_PROPERTIES = { + code: { type: 'string', description: 'Industry code' }, + name: { type: 'string', description: 'Industry name' }, + confidence: { type: 'string', description: 'Match confidence (high, medium, low)' }, +} as const diff --git a/apps/sim/tools/context_dev/utils.ts b/apps/sim/tools/context_dev/utils.ts new file mode 100644 index 00000000000..f6243bef924 --- /dev/null +++ b/apps/sim/tools/context_dev/utils.ts @@ -0,0 +1,80 @@ +/** Base URL for all Context.dev API endpoints. */ +export const CONTEXT_DEV_BASE_URL = 'https://api.context.dev/v1' + +/** + * Builds the standard Context.dev request headers with Bearer authentication. + */ +export function contextDevHeaders(apiKey: string): Record { + return { + Authorization: `Bearer ${apiKey}`, + Accept: 'application/json', + } +} + +/** + * Builds JSON request headers with Bearer authentication for POST endpoints. + */ +export function contextDevJsonHeaders(apiKey: string): Record { + return { + ...contextDevHeaders(apiKey), + 'Content-Type': 'application/json', + } +} + +/** + * Throws a descriptive error when a Context.dev response is not successful. + * Returns the parsed JSON body on success. + */ +export async function parseContextDevResponse(response: Response): Promise { + if (!response.ok) { + const errorText = await response.text() + throw new Error(`Context.dev API error (${response.status}): ${errorText}`) + } + return response.json() +} + +/** Shape of the credit accounting object present on every Context.dev response. */ +interface ContextDevKeyMetadata { + credits_consumed?: number + credits_remaining?: number +} + +/** + * Extracts the credit accounting fields shared by every Context.dev response. + */ +export function extractCreditMetadata(keyMetadata: ContextDevKeyMetadata | undefined): { + creditsConsumed: number | null + creditsRemaining: number | null +} { + return { + creditsConsumed: keyMetadata?.credits_consumed ?? 
null, + creditsRemaining: keyMetadata?.credits_remaining ?? null, + } +} + +/** + * Appends a parameter to a URLSearchParams instance only when it is defined and non-empty. + * Booleans are serialized as the literal strings 'true' / 'false'. + */ +export function appendParam( + search: URLSearchParams, + key: string, + value: string | number | boolean | undefined | null +): void { + if (value === undefined || value === null || value === '') return + search.append(key, String(value)) +} + +/** Output definitions for the credit accounting fields, reused across every tool. */ +export const CREDIT_OUTPUTS = { + creditsConsumed: { + type: 'number', + description: 'Credits consumed by this request', + optional: true, + }, + creditsRemaining: { + type: 'number', + description: 'Credits remaining on the API key', + optional: true, + }, +} as const diff --git a/apps/sim/tools/registry.ts b/apps/sim/tools/registry.ts index c8f7422f592..6aec72892a4 100644 --- a/apps/sim/tools/registry.ts +++ b/apps/sim/tools/registry.ts @@ -522,6 +522,18 @@ import { confluenceUpdateTool, confluenceUploadAttachmentTool, } from '@/tools/confluence' +import { + contextDevClassifyNaicsTool, + contextDevClassifySicTool, + contextDevCrawlTool, + contextDevExtractTool, + contextDevGetBrandTool, + contextDevMapTool, + contextDevScrapeHtmlTool, + contextDevScrapeMarkdownTool, + contextDevScreenshotTool, + contextDevSearchTool, +} from '@/tools/context_dev' import { convexActionTool, convexDocumentDeltasTool, @@ -5491,6 +5503,16 @@ export const tools: Record = { confluence_list_space_properties: confluenceListSpacePropertiesTool, confluence_create_space_property: confluenceCreateSpacePropertyTool, confluence_delete_space_property: confluenceDeleteSpacePropertyTool, + context_dev_scrape_markdown: contextDevScrapeMarkdownTool, + context_dev_scrape_html: contextDevScrapeHtmlTool, + context_dev_screenshot: contextDevScreenshotTool, + context_dev_crawl: contextDevCrawlTool, + context_dev_map: 
contextDevMapTool, + context_dev_search: contextDevSearchTool, + context_dev_extract: contextDevExtractTool, + context_dev_classify_naics: contextDevClassifyNaicsTool, + context_dev_classify_sic: contextDevClassifySicTool, + context_dev_get_brand: contextDevGetBrandTool, cursor_list_agents: cursorListAgentsTool, cursor_list_agents_v2: cursorListAgentsV2Tool, cursor_get_agent: cursorGetAgentTool, From c8ad62aacb4a6534cd529018d62e6be5a6120f93 Mon Sep 17 00:00:00 2001 From: waleed Date: Sun, 14 Jun 2026 19:31:14 -0700 Subject: [PATCH 2/3] feat(context-dev): add brand variants, products, fonts, styleguide, images, prefetch Expands coverage to all relevant Context.dev endpoints (22 tools): brand by name/email/ticker, simplified brand, transaction identifier, single + catalog product extraction, fonts, styleguide, image discovery, and prefetch utilities. Shared brand output schema and transform helper; verified against the live API. --- .../docs/en/integrations/context_dev.mdx | 378 +++++++++++++++++- apps/sim/blocks/blocks/context_dev.ts | 340 ++++++++++++++-- apps/sim/lib/integrations/integrations.json | 54 ++- apps/sim/tools/context_dev/extract_product.ts | 90 +++++ .../sim/tools/context_dev/extract_products.ts | 89 +++++ apps/sim/tools/context_dev/get_brand.ts | 170 ++++---- .../tools/context_dev/get_brand_by_email.ts | 93 +++++ .../tools/context_dev/get_brand_by_name.ts | 100 +++++ .../tools/context_dev/get_brand_by_ticker.ts | 99 +++++ .../tools/context_dev/get_brand_simplified.ts | 78 ++++ .../tools/context_dev/identify_transaction.ts | 121 ++++++ apps/sim/tools/context_dev/index.ts | 12 + .../tools/context_dev/prefetch_by_email.ts | 75 ++++ apps/sim/tools/context_dev/prefetch_domain.ts | 75 ++++ apps/sim/tools/context_dev/scrape_fonts.ts | 92 +++++ apps/sim/tools/context_dev/scrape_images.ts | 115 ++++++ .../tools/context_dev/scrape_styleguide.ts | 87 ++++ apps/sim/tools/context_dev/types.ts | 238 ++++++++++- apps/sim/tools/context_dev/utils.ts | 17 + 
apps/sim/tools/registry.ts | 24 ++ 20 files changed, 2213 insertions(+), 134 deletions(-) create mode 100644 apps/sim/tools/context_dev/extract_product.ts create mode 100644 apps/sim/tools/context_dev/extract_products.ts create mode 100644 apps/sim/tools/context_dev/get_brand_by_email.ts create mode 100644 apps/sim/tools/context_dev/get_brand_by_name.ts create mode 100644 apps/sim/tools/context_dev/get_brand_by_ticker.ts create mode 100644 apps/sim/tools/context_dev/get_brand_simplified.ts create mode 100644 apps/sim/tools/context_dev/identify_transaction.ts create mode 100644 apps/sim/tools/context_dev/prefetch_by_email.ts create mode 100644 apps/sim/tools/context_dev/prefetch_domain.ts create mode 100644 apps/sim/tools/context_dev/scrape_fonts.ts create mode 100644 apps/sim/tools/context_dev/scrape_images.ts create mode 100644 apps/sim/tools/context_dev/scrape_styleguide.ts diff --git a/apps/docs/content/docs/en/integrations/context_dev.mdx b/apps/docs/content/docs/en/integrations/context_dev.mdx index 7e02f980ccd..1da78894369 100644 --- a/apps/docs/content/docs/en/integrations/context_dev.mdx +++ b/apps/docs/content/docs/en/integrations/context_dev.mdx @@ -12,7 +12,7 @@ import { BlockInfoCard } from "@/components/ui/block-info-card" ## Usage Instructions -Integrate Context.dev into the workflow. Scrape pages to markdown or HTML, capture screenshots, crawl entire sites, map sitemaps, search the web, extract structured data, classify industries, and retrieve brand assets — all from one API. +Integrate Context.dev into the workflow. Scrape pages to markdown or HTML, capture screenshots, list images, crawl entire sites, map sitemaps, search the web, extract structured data and products, pull design systems, classify industries, and retrieve brand assets by domain, name, email, ticker, or transaction — all from one API. @@ -67,6 +67,36 @@ Scrape any URL and return the raw HTML content of the page. 
| `url` | string | The scraped URL | | `type` | string | Detected content type \(html, xml, json, text, csv, markdown, svg, pdf\) | +### `context_dev_scrape_images` + +Discover every image asset on a page, with optional dimension and type enrichment. + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `url` | string | Yes | The full URL to scrape images from \(must include http:// or https://\) | +| `maxAgeMs` | number | No | Cache duration in milliseconds \(0-2592000000, default: 86400000\) | +| `waitForMs` | number | No | Browser wait time after page load in milliseconds \(0-30000\) | +| `timeoutMS` | number | No | Request timeout in milliseconds \(1000-300000\) | +| `enrichResolution` | boolean | No | Measure image dimensions \(enables 5-credit enrichment\) | +| `enrichHostedUrl` | boolean | No | Host images on a CDN and return their URL and MIME type \(enables enrichment\) | +| `enrichClassification` | boolean | No | Classify each image by visual asset type \(enables enrichment\) | +| `apiKey` | string | Yes | Context.dev API key | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `success` | boolean | Whether the scrape succeeded | +| `images` | array | Discovered image assets with source, element, type, and optional enrichment | +| ↳ `src` | string | Image source URL or data | +| ↳ `element` | string | Source element \(img, svg, link, source, video, css, object, meta, background\) | +| ↳ `type` | string | Image representation \(url, html, base64\) | +| ↳ `alt` | string | Alt text | +| ↳ `enrichment` | json | Optional enrichment \(width, height, mimetype, url, type\) when requested | +| `url` | string | The scraped URL | + ### `context_dev_screenshot` Capture a screenshot of any web page and store it as a downloadable image file. @@ -208,6 +238,125 @@ Crawl a website and extract structured data matching a provided JSON schema. 
| `data` | json | Structured data matching the requested schema | +| `metadata` | object | Crawl summary \(numUrls, maxCrawlDepth, numSucceeded, numFailed, numSkipped\) | +### `context_dev_extract_product` + +Detect and extract structured product details from a single product page URL. + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `url` | string | Yes | The product page URL \(must include http:// or https://\) | +| `maxAgeMs` | number | No | Cache duration in milliseconds \(0-2592000000, default: 604800000\) | +| `timeoutMS` | number | No | Request timeout in milliseconds \(1000-300000\) | +| `apiKey` | string | Yes | Context.dev API key | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `isProductPage` | boolean | Whether the URL is a product page | +| `platform` | string | Detected platform \(amazon, tiktok_shop, etsy, generic\) | +| `product` | object | Extracted product details | +| ↳ `name` | string | Product name | +| ↳ `description` | string | Product description | +| ↳ `price` | number | Product price | +| ↳ `currency` | string | Price currency | +| ↳ `billing_frequency` | string | Billing frequency \(monthly, yearly, one_time, usage_based\) | +| ↳ `pricing_model` | string | Pricing model \(per_seat, flat, tiered, freemium, custom\) | +| ↳ `url` | string | Product URL | +| ↳ `category` | string | Product category | +| ↳ `features` | json | Product features | +| ↳ `target_audience` | json | Target audience | +| ↳ `tags` | json | Product tags | +| ↳ `image_url` | string | Primary product image URL | +| ↳ `images` | json | Product image URLs | +| ↳ `sku` | string | Product SKU | + +### `context_dev_extract_products` + +Extract the product catalog from a brand's domain, including pricing, features, and metadata. + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `domain` | string | Yes | The domain to extract products from \(e.g.,
"example.com"\) | +| `maxProducts` | number | No | Maximum number of products to extract \(1-12\) | +| `maxAgeMs` | number | No | Cache duration in milliseconds \(0-2592000000, default: 604800000\) | +| `timeoutMS` | number | No | Request timeout in milliseconds \(1000-300000\) | +| `apiKey` | string | Yes | Context.dev API key | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `products` | array | Extracted products with pricing, features, and metadata | +| ↳ `name` | string | Product name | +| ↳ `description` | string | Product description | +| ↳ `price` | number | Product price | +| ↳ `currency` | string | Price currency | +| ↳ `billing_frequency` | string | Billing frequency \(monthly, yearly, one_time, usage_based\) | +| ↳ `pricing_model` | string | Pricing model \(per_seat, flat, tiered, freemium, custom\) | +| ↳ `url` | string | Product URL | +| ↳ `category` | string | Product category | +| ↳ `features` | json | Product features | +| ↳ `target_audience` | json | Target audience | +| ↳ `tags` | json | Product tags | +| ↳ `image_url` | string | Primary product image URL | +| ↳ `images` | json | Product image URLs | +| ↳ `sku` | string | Product SKU | + +### `context_dev_scrape_fonts` + +Extract the font families, usage stats, and font files used by a domain. 
+ +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `domain` | string | Yes | The domain to extract fonts from \(e.g., "example.com"\) | +| `maxAgeMs` | number | No | Cache max age in milliseconds \(86400000-31536000000, default: 7776000000\) | +| `timeoutMS` | number | No | Request timeout in milliseconds \(1000-300000\) | +| `apiKey` | string | Yes | Context.dev API key | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `status` | string | Extraction status | +| `domain` | string | The domain that was analyzed | +| `fonts` | array | Fonts with usage statistics and fallbacks | +| ↳ `font` | string | Font family name | +| ↳ `uses` | json | Where the font is used | +| ↳ `fallbacks` | json | Fallback font families | +| ↳ `num_elements` | number | Number of elements using the font | +| ↳ `num_words` | number | Number of words rendered in the font | +| ↳ `percent_words` | number | Percent of words using the font | +| ↳ `percent_elements` | number | Percent of elements using the font | +| `fontLinks` | json | Font family download links keyed by font name \(type, files, category\) | + +### `context_dev_scrape_styleguide` + +Extract a domain's design system: colors, typography, spacing, shadows, and components. + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `domain` | string | Yes | The domain to extract the styleguide from \(e.g., "example.com"\) | +| `maxAgeMs` | number | No | Cache max age in milliseconds \(86400000-31536000000, default: 7776000000\) | +| `timeoutMS` | number | No | Request timeout in milliseconds \(1000-300000\) | +| `apiKey` | string | Yes | Context.dev API key | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `status` | string | Extraction status | +| `domain` | string | The domain that was analyzed | +| `styleguide` | json | Design system: mode, colors, typography, elementSpacing, shadows, fontLinks,
components | + ### `context_dev_classify_naics` Classify a brand into NAICS industry codes from its domain or company name. @@ -299,8 +448,231 @@ Retrieve brand data for a domain: logos, colors, backdrops, socials, address, an | ↳ `is_nsfw` | boolean | Whether the brand contains adult content | | ↳ `email` | string | Brand contact email | | ↳ `phone` | string | Brand contact phone | -| ↳ `industries` | json | Industry taxonomy \(eic pairs\) | -| ↳ `links` | json | Key brand links \(careers, privacy, terms, etc.\) | +| ↳ `industries` | json | Industry taxonomy \(eic industry/subindustry pairs\) | +| ↳ `links` | json | Key brand links \(careers, privacy, terms, blog, pricing\) | | ↳ `primary_language` | string | Primary language of the brand site | +### `context_dev_get_brand_by_name` + +Retrieve brand data by company name: logos, colors, socials, address, and industry. + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `name` | string | Yes | Company name to retrieve brand data for \(3-30 chars, e.g., "Apple Inc"\) | +| `countryGl` | string | No | ISO 2-letter country code to prioritize \(e.g., "us"\) | +| `forceLanguage` | string | No | Override the detected language with a supported language code | +| `maxSpeed` | boolean | No | Skip time-consuming operations for a faster response \(default: false\) | +| `maxAgeMs` | number | No | Cache max age in milliseconds \(86400000-31536000000, default: 7776000000\) | +| `timeoutMS` | number | No | Request timeout in milliseconds \(1000-300000\) | +| `apiKey` | string | Yes | Context.dev API key | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `status` | string | Retrieval status | +| `brand` | object | Brand data object | +| ↳ `domain` | string | Brand domain | +| ↳ `title` | string | Brand title | +| ↳ `description` | string | Brand description | +| ↳ `slogan` | string | Brand slogan | +| ↳ `colors` | json | Brand 
colors \(hex and name\) | +| ↳ `logos` | json | Brand logos with mode, colors, resolution, and type | +| ↳ `backdrops` | json | Brand backdrop images | +| ↳ `socials` | json | Social media profiles \(type and url\) | +| ↳ `address` | json | Brand address | +| ↳ `stock` | json | Stock info \(ticker and exchange\) | +| ↳ `is_nsfw` | boolean | Whether the brand contains adult content | +| ↳ `email` | string | Brand contact email | +| ↳ `phone` | string | Brand contact phone | +| ↳ `industries` | json | Industry taxonomy \(eic industry/subindustry pairs\) | +| ↳ `links` | json | Key brand links \(careers, privacy, terms, blog, pricing\) | +| ↳ `primary_language` | string | Primary language of the brand site | + +### `context_dev_get_brand_by_email` + +Retrieve brand data from a work email address. Free/disposable emails are rejected (422). + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `email` | string | Yes | Work email address; the domain is extracted \(free providers are rejected\) | +| `forceLanguage` | string | No | Override the detected language with a supported language code | +| `maxSpeed` | boolean | No | Skip time-consuming operations for a faster response \(default: false\) | +| `maxAgeMs` | number | No | Cache max age in milliseconds \(86400000-31536000000, default: 7776000000\) | +| `timeoutMS` | number | No | Request timeout in milliseconds \(1000-300000\) | +| `apiKey` | string | Yes | Context.dev API key | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `status` | string | Retrieval status | +| `brand` | object | Brand data object | +| ↳ `domain` | string | Brand domain | +| ↳ `title` | string | Brand title | +| ↳ `description` | string | Brand description | +| ↳ `slogan` | string | Brand slogan | +| ↳ `colors` | json | Brand colors \(hex and name\) | +| ↳ `logos` | json | Brand logos with mode, colors, resolution, and type | +| ↳ `backdrops` | 
json | Brand backdrop images | +| ↳ `socials` | json | Social media profiles \(type and url\) | +| ↳ `address` | json | Brand address | +| ↳ `stock` | json | Stock info \(ticker and exchange\) | +| ↳ `is_nsfw` | boolean | Whether the brand contains adult content | +| ↳ `email` | string | Brand contact email | +| ↳ `phone` | string | Brand contact phone | +| ↳ `industries` | json | Industry taxonomy \(eic industry/subindustry pairs\) | +| ↳ `links` | json | Key brand links \(careers, privacy, terms, blog, pricing\) | +| ↳ `primary_language` | string | Primary language of the brand site | + +### `context_dev_get_brand_by_ticker` + +Retrieve brand data for a public company by its stock ticker symbol. + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `ticker` | string | Yes | Stock ticker symbol \(e.g., "AAPL", "GOOGL", "BRK.A"\) | +| `tickerExchange` | string | No | Exchange code for the ticker \(e.g., "NASDAQ", "NYSE", "LSE"\). 
Default: NASDAQ | +| `forceLanguage` | string | No | Override the detected language with a supported language code | +| `maxSpeed` | boolean | No | Skip time-consuming operations for a faster response \(default: false\) | +| `maxAgeMs` | number | No | Cache max age in milliseconds \(86400000-31536000000, default: 7776000000\) | +| `timeoutMS` | number | No | Request timeout in milliseconds \(1000-300000\) | +| `apiKey` | string | Yes | Context.dev API key | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `status` | string | Retrieval status | +| `brand` | object | Brand data object | +| ↳ `domain` | string | Brand domain | +| ↳ `title` | string | Brand title | +| ↳ `description` | string | Brand description | +| ↳ `slogan` | string | Brand slogan | +| ↳ `colors` | json | Brand colors \(hex and name\) | +| ↳ `logos` | json | Brand logos with mode, colors, resolution, and type | +| ↳ `backdrops` | json | Brand backdrop images | +| ↳ `socials` | json | Social media profiles \(type and url\) | +| ↳ `address` | json | Brand address | +| ↳ `stock` | json | Stock info \(ticker and exchange\) | +| ↳ `is_nsfw` | boolean | Whether the brand contains adult content | +| ↳ `email` | string | Brand contact email | +| ↳ `phone` | string | Brand contact phone | +| ↳ `industries` | json | Industry taxonomy \(eic industry/subindustry pairs\) | +| ↳ `links` | json | Key brand links \(careers, privacy, terms, blog, pricing\) | +| ↳ `primary_language` | string | Primary language of the brand site | + +### `context_dev_get_brand_simplified` + +Retrieve essential brand data for a domain: title, colors, logos, and backdrops. 
+ +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `domain` | string | Yes | The domain to retrieve simplified brand data for \(e.g., "airbnb.com"\) | +| `maxAgeMs` | number | No | Cache max age in milliseconds \(86400000-31536000000, default: 7776000000\) | +| `timeoutMS` | number | No | Request timeout in milliseconds \(1000-300000\) | +| `apiKey` | string | Yes | Context.dev API key | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `status` | string | Retrieval status | +| `brand` | object | Simplified brand data \(domain, title, colors, logos, backdrops\) | +| ↳ `domain` | string | Brand domain | +| ↳ `title` | string | Brand title | +| ↳ `colors` | json | Brand colors \(hex and name\) | +| ↳ `logos` | json | Brand logos with mode, colors, resolution, and type | +| ↳ `backdrops` | json | Brand backdrop images | + +### `context_dev_identify_transaction` + +Identify the brand behind a raw bank/card transaction descriptor and return its brand data. 
+ +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `transactionInfo` | string | Yes | The raw transaction descriptor or identifier to resolve to a brand | +| `countryGl` | string | No | ISO 2-letter country code from the transaction \(e.g., "us", "gb"\) | +| `city` | string | No | City name to prioritize in the search | +| `mcc` | string | No | Merchant Category Code for the business category | +| `phone` | number | No | Phone number from the transaction for verification | +| `highConfidenceOnly` | boolean | No | Enforce additional verification steps for higher confidence \(default: false\) | +| `forceLanguage` | string | No | Override the detected language with a supported language code | +| `maxSpeed` | boolean | No | Skip time-consuming operations for a faster response \(default: false\) | +| `timeoutMS` | number | No | Request timeout in milliseconds \(1000-300000\) | +| `apiKey` | string | Yes | Context.dev API key | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `status` | string | Identification status | +| `brand` | object | Brand data for the identified merchant | +| ↳ `domain` | string | Brand domain | +| ↳ `title` | string | Brand title | +| ↳ `description` | string | Brand description | +| ↳ `slogan` | string | Brand slogan | +| ↳ `colors` | json | Brand colors \(hex and name\) | +| ↳ `logos` | json | Brand logos with mode, colors, resolution, and type | +| ↳ `backdrops` | json | Brand backdrop images | +| ↳ `socials` | json | Social media profiles \(type and url\) | +| ↳ `address` | json | Brand address | +| ↳ `stock` | json | Stock info \(ticker and exchange\) | +| ↳ `is_nsfw` | boolean | Whether the brand contains adult content | +| ↳ `email` | string | Brand contact email | +| ↳ `phone` | string | Brand contact phone | +| ↳ `industries` | json | Industry taxonomy \(eic industry/subindustry pairs\) | +| ↳ `links` | json | Key brand links 
\(careers, privacy, terms, blog, pricing\) | +| ↳ `primary_language` | string | Primary language of the brand site | + +### `context_dev_prefetch_domain` + +Queue a domain for brand-data prefetching to reduce latency on later requests (subscribers; 0 credits). + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `domain` | string | Yes | The domain to prefetch brand data for \(e.g., "example.com"\) | +| `timeoutMS` | number | No | Request timeout in milliseconds \(1000-300000\) | +| `apiKey` | string | Yes | Context.dev API key | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `status` | string | Prefetch status | +| `message` | string | Human-readable prefetch result message | +| `domain` | string | The domain queued for prefetching | + +### `context_dev_prefetch_by_email` + +Queue an email's domain for brand-data prefetching to reduce latency on later requests. + +#### Input + +| Parameter | Type | Required | Description | +| --------- | ---- | -------- | ----------- | +| `email` | string | Yes | Work email address whose domain should be prefetched \(free providers rejected\) | +| `timeoutMS` | number | No | Request timeout in milliseconds \(1000-300000\) | +| `apiKey` | string | Yes | Context.dev API key | + +#### Output + +| Parameter | Type | Description | +| --------- | ---- | ----------- | +| `status` | string | Prefetch status | +| `message` | string | Human-readable prefetch result message | +| `domain` | string | The domain queued for prefetching | + diff --git a/apps/sim/blocks/blocks/context_dev.ts b/apps/sim/blocks/blocks/context_dev.ts index fead0ac24d3..82a35587ed4 100644 --- a/apps/sim/blocks/blocks/context_dev.ts +++ b/apps/sim/blocks/blocks/context_dev.ts @@ -3,7 +3,56 @@ import type { BlockConfig, BlockMeta } from '@/blocks/types' import { AuthMode, IntegrationType } from '@/blocks/types' import type { ContextDevScrapeMarkdownResponse } from '@/tools/context_dev/types' -const SCRAPE_OPS = ['scrape_markdown', 
'scrape_html', 'screenshot', 'crawl', 'extract'] +/** Operations whose primary input is a full page URL. */ +const URL_OPS = [ + 'scrape_markdown', + 'scrape_html', + 'scrape_images', + 'screenshot', + 'crawl', + 'extract', + 'extract_product', +] +/** Operations whose primary input is a bare domain. */ +const DOMAIN_OPS = [ + 'map', + 'get_brand', + 'get_brand_simplified', + 'extract_products', + 'scrape_fonts', + 'scrape_styleguide', + 'prefetch_domain', +] +/** Classification operations keyed on a domain-or-name input. */ +const CLASSIFY_OPS = ['classify_naics', 'classify_sic'] +/** Brand operations that accept language/speed tuning. */ +const BRAND_LANG_OPS = [ + 'get_brand', + 'get_brand_by_name', + 'get_brand_by_email', + 'get_brand_by_ticker', + 'identify_transaction', +] +/** Operations that accept a cache max-age. */ +const MAX_AGE_OPS = [ + 'scrape_markdown', + 'scrape_html', + 'scrape_images', + 'screenshot', + 'crawl', + 'extract', + 'extract_product', + 'extract_products', + 'scrape_fonts', + 'scrape_styleguide', + 'get_brand', + 'get_brand_by_name', + 'get_brand_by_email', + 'get_brand_by_ticker', + 'get_brand_simplified', +] +/** Operations that accept a post-load browser wait. */ +const WAIT_FOR_OPS = ['scrape_markdown', 'scrape_html', 'scrape_images', 'screenshot', 'crawl'] /** * Coerces a value that may be a number or numeric string into a number, or undefined. @@ -39,7 +88,7 @@ export const ContextDevBlock: BlockConfig = { description: 'Scrape, crawl, search, extract, and enrich web and brand data', authMode: AuthMode.ApiKey, longDescription: - 'Integrate Context.dev into the workflow. Scrape pages to markdown or HTML, capture screenshots, crawl entire sites, map sitemaps, search the web, extract structured data, classify industries, and retrieve brand assets — all from one API.', + 'Integrate Context.dev into the workflow. 
Scrape pages to markdown or HTML, capture screenshots, list images, crawl entire sites, map sitemaps, search the web, extract structured data and products, pull design systems, classify industries, and retrieve brand assets by domain, name, email, ticker, or transaction — all from one API.', docsLink: 'https://docs.sim.ai/integrations/context_dev', category: 'tools', integrationType: IntegrationType.Search, @@ -53,14 +102,26 @@ export const ContextDevBlock: BlockConfig = { options: [ { label: 'Scrape Markdown', id: 'scrape_markdown' }, { label: 'Scrape HTML', id: 'scrape_html' }, + { label: 'Scrape Images', id: 'scrape_images' }, { label: 'Screenshot', id: 'screenshot' }, { label: 'Crawl Website', id: 'crawl' }, { label: 'Map Sitemap', id: 'map' }, { label: 'Web Search', id: 'search' }, { label: 'Extract Structured Data', id: 'extract' }, + { label: 'Extract Product', id: 'extract_product' }, + { label: 'Extract Products', id: 'extract_products' }, + { label: 'Scrape Fonts', id: 'scrape_fonts' }, + { label: 'Scrape Styleguide', id: 'scrape_styleguide' }, { label: 'Classify NAICS', id: 'classify_naics' }, { label: 'Classify SIC', id: 'classify_sic' }, - { label: 'Get Brand Data', id: 'get_brand' }, + { label: 'Get Brand by Domain', id: 'get_brand' }, + { label: 'Get Brand by Name', id: 'get_brand_by_name' }, + { label: 'Get Brand by Email', id: 'get_brand_by_email' }, + { label: 'Get Brand by Ticker', id: 'get_brand_by_ticker' }, + { label: 'Get Brand (Simplified)', id: 'get_brand_simplified' }, + { label: 'Identify Transaction', id: 'identify_transaction' }, + { label: 'Prefetch Domain', id: 'prefetch_domain' }, + { label: 'Prefetch by Email', id: 'prefetch_by_email' }, ], value: () => 'scrape_markdown', }, @@ -69,24 +130,24 @@ export const ContextDevBlock: BlockConfig = { title: 'Website URL', type: 'short-input', placeholder: 'https://example.com', - condition: { field: 'operation', value: SCRAPE_OPS }, - required: { field: 'operation', value: SCRAPE_OPS }, + 
condition: { field: 'operation', value: URL_OPS }, + required: { field: 'operation', value: URL_OPS }, }, { id: 'domain', title: 'Domain', type: 'short-input', placeholder: 'example.com', - condition: { field: 'operation', value: ['map', 'get_brand'] }, - required: { field: 'operation', value: ['map', 'get_brand'] }, + condition: { field: 'operation', value: DOMAIN_OPS }, + required: { field: 'operation', value: DOMAIN_OPS }, }, { id: 'input', title: 'Domain or Company Name', type: 'short-input', placeholder: 'example.com or Company Name', - condition: { field: 'operation', value: ['classify_naics', 'classify_sic'] }, - required: { field: 'operation', value: ['classify_naics', 'classify_sic'] }, + condition: { field: 'operation', value: CLASSIFY_OPS }, + required: { field: 'operation', value: CLASSIFY_OPS }, }, { id: 'query', @@ -96,6 +157,38 @@ export const ContextDevBlock: BlockConfig = { condition: { field: 'operation', value: 'search' }, required: { field: 'operation', value: 'search' }, }, + { + id: 'name', + title: 'Company Name', + type: 'short-input', + placeholder: 'Apple Inc', + condition: { field: 'operation', value: 'get_brand_by_name' }, + required: { field: 'operation', value: 'get_brand_by_name' }, + }, + { + id: 'email', + title: 'Work Email', + type: 'short-input', + placeholder: 'name@company.com', + condition: { field: 'operation', value: ['get_brand_by_email', 'prefetch_by_email'] }, + required: { field: 'operation', value: ['get_brand_by_email', 'prefetch_by_email'] }, + }, + { + id: 'ticker', + title: 'Stock Ticker', + type: 'short-input', + placeholder: 'AAPL', + condition: { field: 'operation', value: 'get_brand_by_ticker' }, + required: { field: 'operation', value: 'get_brand_by_ticker' }, + }, + { + id: 'transactionInfo', + title: 'Transaction Descriptor', + type: 'short-input', + placeholder: 'SQ *COFFEE SHOP 1234', + condition: { field: 'operation', value: 'identify_transaction' }, + required: { field: 'operation', value: 
'identify_transaction' }, + }, { id: 'schema', title: 'Extraction Schema', @@ -163,6 +256,13 @@ Do not include any explanations, markdown formatting, or other text outside the type: 'switch', condition: { field: 'operation', value: 'search' }, }, + { + id: 'tickerExchange', + title: 'Exchange', + type: 'short-input', + placeholder: 'NASDAQ', + condition: { field: 'operation', value: 'get_brand_by_ticker' }, + }, { id: 'sicType', title: 'SIC Taxonomy', @@ -240,6 +340,14 @@ Do not include any explanations, markdown formatting, or other text outside the mode: 'advanced', condition: { field: 'operation', value: ['crawl', 'extract'] }, }, + { + id: 'maxProducts', + title: 'Max Products', + type: 'short-input', + placeholder: '12', + mode: 'advanced', + condition: { field: 'operation', value: 'extract_products' }, + }, { id: 'urlRegex', title: 'URL Regex', @@ -272,13 +380,34 @@ Do not include any explanations, markdown formatting, or other text outside the mode: 'advanced', condition: { field: 'operation', value: 'screenshot' }, }, + { + id: 'enrichResolution', + title: 'Enrich: Resolution', + type: 'switch', + mode: 'advanced', + condition: { field: 'operation', value: 'scrape_images' }, + }, + { + id: 'enrichHostedUrl', + title: 'Enrich: Hosted URL', + type: 'switch', + mode: 'advanced', + condition: { field: 'operation', value: 'scrape_images' }, + }, + { + id: 'enrichClassification', + title: 'Enrich: Classification', + type: 'switch', + mode: 'advanced', + condition: { field: 'operation', value: 'scrape_images' }, + }, { id: 'minResults', title: 'Min Results', type: 'short-input', placeholder: '1', mode: 'advanced', - condition: { field: 'operation', value: ['classify_naics', 'classify_sic'] }, + condition: { field: 'operation', value: CLASSIFY_OPS }, }, { id: 'maxResults', @@ -286,7 +415,46 @@ Do not include any explanations, markdown formatting, or other text outside the type: 'short-input', placeholder: '5', mode: 'advanced', - condition: { field: 'operation', 
value: ['classify_naics', 'classify_sic'] }, + condition: { field: 'operation', value: CLASSIFY_OPS }, + }, + { + id: 'countryGl', + title: 'Country Code', + type: 'short-input', + placeholder: 'us', + mode: 'advanced', + condition: { field: 'operation', value: ['get_brand_by_name', 'identify_transaction'] }, + }, + { + id: 'city', + title: 'City', + type: 'short-input', + placeholder: 'San Francisco', + mode: 'advanced', + condition: { field: 'operation', value: 'identify_transaction' }, + }, + { + id: 'mcc', + title: 'Merchant Category Code', + type: 'short-input', + placeholder: '5812', + mode: 'advanced', + condition: { field: 'operation', value: 'identify_transaction' }, + }, + { + id: 'phone', + title: 'Phone', + type: 'short-input', + placeholder: '14155551234', + mode: 'advanced', + condition: { field: 'operation', value: 'identify_transaction' }, + }, + { + id: 'highConfidenceOnly', + title: 'High Confidence Only', + type: 'switch', + mode: 'advanced', + condition: { field: 'operation', value: 'identify_transaction' }, }, { id: 'forceLanguage', @@ -294,14 +462,14 @@ Do not include any explanations, markdown formatting, or other text outside the type: 'short-input', placeholder: 'e.g., en, es, fr', mode: 'advanced', - condition: { field: 'operation', value: 'get_brand' }, + condition: { field: 'operation', value: BRAND_LANG_OPS }, }, { id: 'maxSpeed', title: 'Max Speed', type: 'switch', mode: 'advanced', - condition: { field: 'operation', value: 'get_brand' }, + condition: { field: 'operation', value: BRAND_LANG_OPS }, }, { id: 'waitForMs', @@ -309,10 +477,7 @@ Do not include any explanations, markdown formatting, or other text outside the type: 'short-input', placeholder: '0', mode: 'advanced', - condition: { - field: 'operation', - value: ['scrape_markdown', 'scrape_html', 'screenshot', 'crawl'], - }, + condition: { field: 'operation', value: WAIT_FOR_OPS }, }, { id: 'stopAfterMs', @@ -328,10 +493,7 @@ Do not include any explanations, markdown formatting, 
or other text outside the type: 'short-input', placeholder: '86400000', mode: 'advanced', - condition: { - field: 'operation', - value: ['scrape_markdown', 'scrape_html', 'screenshot', 'crawl', 'extract', 'get_brand'], - }, + condition: { field: 'operation', value: MAX_AGE_OPS }, }, { id: 'timeoutMS', @@ -353,14 +515,26 @@ Do not include any explanations, markdown formatting, or other text outside the access: [ 'context_dev_scrape_markdown', 'context_dev_scrape_html', + 'context_dev_scrape_images', 'context_dev_screenshot', 'context_dev_crawl', 'context_dev_map', 'context_dev_search', 'context_dev_extract', + 'context_dev_extract_product', + 'context_dev_extract_products', + 'context_dev_scrape_fonts', + 'context_dev_scrape_styleguide', 'context_dev_classify_naics', 'context_dev_classify_sic', 'context_dev_get_brand', + 'context_dev_get_brand_by_name', + 'context_dev_get_brand_by_email', + 'context_dev_get_brand_by_ticker', + 'context_dev_get_brand_simplified', + 'context_dev_identify_transaction', + 'context_dev_prefetch_domain', + 'context_dev_prefetch_by_email', ], config: { tool: (params) => @@ -397,6 +571,15 @@ Do not include any explanations, markdown formatting, or other text outside the setNumber('waitForMs') setNumber('timeoutMS') break + case 'scrape_images': + setString('url') + setNumber('maxAgeMs') + setNumber('waitForMs') + setNumber('timeoutMS') + setBool('enrichResolution') + setBool('enrichHostedUrl') + setBool('enrichClassification') + break case 'screenshot': setString('url') setBool('fullScreenshot') @@ -462,6 +645,27 @@ Do not include any explanations, markdown formatting, or other text outside the setNumber('timeoutMS') break } + case 'extract_product': + setString('url') + setNumber('maxAgeMs') + setNumber('timeoutMS') + break + case 'extract_products': + setString('domain') + setNumber('maxProducts') + setNumber('maxAgeMs') + setNumber('timeoutMS') + break + case 'scrape_fonts': + setString('domain') + setNumber('maxAgeMs') + 
setNumber('timeoutMS') + break + case 'scrape_styleguide': + setString('domain') + setNumber('maxAgeMs') + setNumber('timeoutMS') + break case 'classify_naics': setString('input') setNumber('minResults') @@ -482,6 +686,53 @@ Do not include any explanations, markdown formatting, or other text outside the setNumber('maxAgeMs') setNumber('timeoutMS') break + case 'get_brand_by_name': + setString('name') + setString('countryGl') + setString('forceLanguage') + setBool('maxSpeed') + setNumber('maxAgeMs') + setNumber('timeoutMS') + break + case 'get_brand_by_email': + setString('email') + setString('forceLanguage') + setBool('maxSpeed') + setNumber('maxAgeMs') + setNumber('timeoutMS') + break + case 'get_brand_by_ticker': + setString('ticker') + setString('tickerExchange') + setString('forceLanguage') + setBool('maxSpeed') + setNumber('maxAgeMs') + setNumber('timeoutMS') + break + case 'get_brand_simplified': + setString('domain') + setNumber('maxAgeMs') + setNumber('timeoutMS') + break + case 'identify_transaction': + setString('transactionInfo') + setString('countryGl') + setString('city') + setString('mcc') + setNumber('phone') + setBool('highConfidenceOnly') + setString('forceLanguage') + setBool('maxSpeed') + setNumber('timeoutMS') + break + case 'prefetch_domain': + setString('domain') + setNumber('timeoutMS') + break + case 'prefetch_by_email': + setString('email') + setNumber('timeoutMS') + break } return result @@ -491,10 +742,14 @@ Do not include any explanations, markdown formatting, or other text outside the inputs: { apiKey: { type: 'string', description: 'Context.dev API key' }, operation: { type: 'string', description: 'Operation to perform' }, - url: { type: 'string', description: 'Target website URL' }, + url: { type: 'string', description: 'Target website or page URL' }, domain: { type: 'string', description: 'Target domain' }, input: { type: 'string', description: 'Domain or company name for classification' }, query: { type: 'string', description: 'Web 
search query' }, + name: { type: 'string', description: 'Company name for brand lookup' }, + email: { type: 'string', description: 'Work email for brand lookup or prefetch' }, + ticker: { type: 'string', description: 'Stock ticker for brand lookup' }, + transactionInfo: { type: 'string', description: 'Transaction descriptor to identify' }, schema: { type: 'json', description: 'JSON schema for structured extraction' }, instructions: { type: 'string', description: 'Extraction guidance' }, useMainContentOnly: { type: 'boolean', description: 'Return only main content' }, @@ -503,6 +758,7 @@ Do not include any explanations, markdown formatting, or other text outside the fullScreenshot: { type: 'boolean', description: 'Capture the full page' }, handleCookiePopup: { type: 'boolean', description: 'Dismiss cookie banners' }, markdownEnabled: { type: 'boolean', description: 'Scrape search results to markdown' }, + tickerExchange: { type: 'string', description: 'Stock exchange for the ticker' }, sicType: { type: 'string', description: 'SIC taxonomy version' }, freshness: { type: 'string', description: 'Search recency filter' }, includeDomains: { type: 'json', description: 'Domains to allowlist in search' }, @@ -512,12 +768,21 @@ Do not include any explanations, markdown formatting, or other text outside the followSubdomains: { type: 'boolean', description: 'Follow subdomain links' }, maxPages: { type: 'number', description: 'Maximum pages to process' }, maxDepth: { type: 'number', description: 'Maximum link depth' }, + maxProducts: { type: 'number', description: 'Maximum products to extract' }, urlRegex: { type: 'string', description: 'Regex to filter URLs' }, maxLinks: { type: 'number', description: 'Maximum sitemap URLs' }, viewportWidth: { type: 'number', description: 'Screenshot viewport width' }, viewportHeight: { type: 'number', description: 'Screenshot viewport height' }, + enrichResolution: { type: 'boolean', description: 'Measure scraped image dimensions' }, + 
enrichHostedUrl: { type: 'boolean', description: 'Host scraped images and return URLs' }, + enrichClassification: { type: 'boolean', description: 'Classify scraped images by type' }, minResults: { type: 'number', description: 'Minimum classification results' }, maxResults: { type: 'number', description: 'Maximum classification results' }, + countryGl: { type: 'string', description: 'ISO country code hint' }, + city: { type: 'string', description: 'City hint for transaction lookup' }, + mcc: { type: 'string', description: 'Merchant category code' }, + phone: { type: 'number', description: 'Phone number from transaction' }, + highConfidenceOnly: { type: 'boolean', description: 'Require high-confidence match' }, forceLanguage: { type: 'string', description: 'Override detected brand language' }, maxSpeed: { type: 'boolean', description: 'Skip slow brand operations' }, waitForMs: { type: 'number', description: 'Browser wait time in ms' }, @@ -536,14 +801,24 @@ Do not include any explanations, markdown formatting, or other text outside the domain: { type: 'string', description: 'Resolved domain' }, width: { type: 'number', description: 'Screenshot width in pixels' }, height: { type: 'number', description: 'Screenshot height in pixels' }, + success: { type: 'boolean', description: 'Whether the scrape succeeded' }, + images: { type: 'json', description: 'Discovered image assets' }, results: { type: 'json', description: 'Crawl pages or search results' }, metadata: { type: 'json', description: 'Crawl or extraction summary metadata' }, urls: { type: 'json', description: 'Discovered sitemap URLs' }, meta: { type: 'json', description: 'Sitemap discovery stats' }, query: { type: 'string', description: 'The query that was searched' }, status: { type: 'string', description: 'Operation status' }, + message: { type: 'string', description: 'Prefetch result message' }, urlsAnalyzed: { type: 'json', description: 'URLs analyzed during extraction' }, data: { type: 'json', description: 
'Structured data extracted from the site' }, + isProductPage: { type: 'boolean', description: 'Whether the URL is a product page' }, + platform: { type: 'string', description: 'Detected commerce platform' }, + product: { type: 'json', description: 'Extracted single product details' }, + products: { type: 'json', description: 'Extracted product catalog' }, + fonts: { type: 'json', description: 'Fonts with usage statistics' }, + fontLinks: { type: 'json', description: 'Font family download links' }, + styleguide: { type: 'json', description: 'Design system (colors, typography, components)' }, codes: { type: 'json', description: 'Matched industry classification codes' }, classification: { type: 'string', description: 'SIC taxonomy version used' }, brand: { type: 'json', description: 'Brand data (logos, colors, socials, industry)' }, @@ -579,7 +854,7 @@ export const ContextDevBlockMeta = { icon: ContextDevIcon, title: 'Context.dev lead enrichment', prompt: - 'Create a workflow that takes a work email or domain, uses Context.dev to retrieve brand data and classify the company into NAICS codes, and writes the enriched firmographics to a CRM record.', + 'Create a workflow that takes a work email, uses Context.dev to retrieve brand data by email and classify the company into NAICS codes, and writes the enriched firmographics to a CRM record.', modules: ['agent', 'tables', 'workflows'], category: 'sales', tags: ['enrichment', 'sales'], @@ -604,18 +879,27 @@ export const ContextDevBlockMeta = { }, { icon: ContextDevIcon, - title: 'Context.dev brand asset fetcher', + title: 'Context.dev design-system extractor', prompt: - 'Build a workflow that takes a domain, uses Context.dev to retrieve the brand logos, colors, and a homepage screenshot, and stores the assets as files for a design handoff.', + 'Build a workflow that takes a domain, uses Context.dev to scrape its styleguide and fonts plus a homepage screenshot, and stores the design tokens and assets as files for a design 
handoff.', modules: ['agent', 'files', 'workflows'], - category: 'marketing', - tags: ['marketing', 'enrichment'], + category: 'engineering', + tags: ['design', 'research'], }, { icon: ContextDevIcon, title: 'Context.dev transaction enrichment', prompt: - 'Create a workflow that classifies a list of company domains into SIC and NAICS industry codes with Context.dev and appends the codes to a table for downstream reporting.', + 'Create a workflow that takes raw bank transaction descriptors, uses Context.dev to identify the merchant brand behind each one, and appends the resolved company and logo to a table.', + modules: ['tables', 'agent', 'workflows'], + category: 'operations', + tags: ['enrichment', 'automation'], + }, + { + icon: ContextDevIcon, + title: 'Context.dev product catalog importer', + prompt: + "Build a workflow that takes a brand domain, uses Context.dev to extract the brand's product catalog with pricing and features, and writes each product as a row in a table.", modules: ['tables', 'agent', 'workflows'], category: 'operations', tags: ['enrichment', 'automation'], diff --git a/apps/sim/lib/integrations/integrations.json b/apps/sim/lib/integrations/integrations.json index b66f26b25b9..219fed3b77c 100644 --- a/apps/sim/lib/integrations/integrations.json +++ b/apps/sim/lib/integrations/integrations.json @@ -3341,7 +3341,7 @@ "slug": "context-dev", "name": "Context.dev", "description": "Scrape, crawl, search, extract, and enrich web and brand data", - "longDescription": "Integrate Context.dev into the workflow. Scrape pages to markdown or HTML, capture screenshots, crawl entire sites, map sitemaps, search the web, extract structured data, classify industries, and retrieve brand assets — all from one API.", + "longDescription": "Integrate Context.dev into the workflow. 
Scrape pages to markdown or HTML, capture screenshots, list images, crawl entire sites, map sitemaps, search the web, extract structured data and products, pull design systems, classify industries, and retrieve brand assets by domain, name, email, ticker, or transaction — all from one API.", "bgColor": "#ffffff", "iconName": "ContextDevIcon", "docsUrl": "https://docs.sim.ai/integrations/context_dev", @@ -3354,6 +3354,10 @@ "name": "Scrape HTML", "description": "Scrape any URL and return the raw HTML content of the page." }, + { + "name": "Scrape Images", + "description": "Discover every image asset on a page, with optional dimension and type enrichment." + }, { "name": "Screenshot", "description": "Capture a screenshot of any web page and store it as a downloadable image file." @@ -3374,6 +3378,22 @@ "name": "Extract Structured Data", "description": "Crawl a website and extract structured data matching a provided JSON schema." }, + { + "name": "Extract Product", + "description": "Detect and extract structured product details from a single product page URL." + }, + { + "name": "Extract Products", + "description": "Extract the product catalog from a brand's website by domain (beta)." + }, + { + "name": "Scrape Fonts", + "description": "Extract the font families, usage stats, and font files used by a domain." + }, + { + "name": "Scrape Styleguide", + "description": "Extract a domain" + }, { "name": "Classify NAICS", "description": "Classify a brand into NAICS industry codes from its domain or company name." @@ -3383,11 +3403,39 @@ "description": "Classify a brand into SIC industry codes from its domain or company name." }, { - "name": "Get Brand Data", + "name": "Get Brand by Domain", "description": "Retrieve brand data for a domain: logos, colors, backdrops, socials, address, and industry." + }, + { + "name": "Get Brand by Name", + "description": "Retrieve brand data by company name: logos, colors, socials, address, and industry."
+ }, + { + "name": "Get Brand by Email", + "description": "Retrieve brand data from a work email address. Free/disposable emails are rejected (422)." + }, + { + "name": "Get Brand by Ticker", + "description": "Retrieve brand data for a public company by its stock ticker symbol." + }, + { + "name": "Get Brand (Simplified)", + "description": "Retrieve essential brand data for a domain: title, colors, logos, and backdrops." + }, + { + "name": "Identify Transaction", + "description": "Identify the brand behind a raw bank/card transaction descriptor and return its brand data." + }, + { + "name": "Prefetch Domain", + "description": "Queue a domain for brand-data prefetching to reduce latency on later requests (subscribers; 0 credits)." + }, + { + "name": "Prefetch by Email", + "description": "Queue an email's domain for brand-data prefetching to reduce later latency (subscribers; 0 credits). Free/disposable emails are rejected." } ], - "operationCount": 10, + "operationCount": 22, "triggers": [], "triggerCount": 0, "authType": "api-key", diff --git a/apps/sim/tools/context_dev/extract_product.ts b/apps/sim/tools/context_dev/extract_product.ts new file mode 100644 index 00000000000..a70f39fa96f --- /dev/null +++ b/apps/sim/tools/context_dev/extract_product.ts @@ -0,0 +1,90 @@ +import type { + ContextDevExtractProductParams, + ContextDevExtractProductResponse, +} from '@/tools/context_dev/types' +import { PRODUCT_OUTPUT_PROPERTIES } from '@/tools/context_dev/types' +import { + CONTEXT_DEV_BASE_URL, + CREDIT_OUTPUTS, + contextDevJsonHeaders, + extractCreditMetadata, + parseContextDevResponse, +} from '@/tools/context_dev/utils' +import type { ToolConfig } from '@/tools/types' + +export const contextDevExtractProductTool: ToolConfig< + ContextDevExtractProductParams, + ContextDevExtractProductResponse +> = { + id: 'context_dev_extract_product', + name: 'Context.dev Extract Product', + description: 'Detect and extract structured product details from a single product page URL.', + version: '1.0.0', + + params: { + url: { + type: 'string', + required: true, + visibility: 'user-or-llm', +
description: 'The product page URL (must include http:// or https://)', + }, + maxAgeMs: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Cache duration in milliseconds (0-2592000000, default: 604800000)', + }, + timeoutMS: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Request timeout in milliseconds (1000-300000)', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'Context.dev API key', + }, + }, + + request: { + method: 'POST', + url: () => `${CONTEXT_DEV_BASE_URL}/brand/ai/product`, + headers: (params) => contextDevJsonHeaders(params.apiKey), + body: (params) => { + const body: Record = { url: params.url } + if (params.maxAgeMs != null) body.maxAgeMs = params.maxAgeMs + if (params.timeoutMS != null) body.timeoutMS = params.timeoutMS + return body + }, + }, + + transformResponse: async (response: Response) => { + const data = await parseContextDevResponse(response) + return { + success: true, + output: { + isProductPage: data.is_product_page ?? false, + platform: data.platform ?? null, + product: data.product ?? 
null, + ...extractCreditMetadata(data.key_metadata), + }, + } + }, + + outputs: { + isProductPage: { type: 'boolean', description: 'Whether the URL is a product page' }, + platform: { + type: 'string', + description: 'Detected platform (amazon, tiktok_shop, etsy, generic)', + optional: true, + }, + product: { + type: 'object', + description: 'Extracted product details', + properties: PRODUCT_OUTPUT_PROPERTIES, + }, + ...CREDIT_OUTPUTS, + }, +} diff --git a/apps/sim/tools/context_dev/extract_products.ts b/apps/sim/tools/context_dev/extract_products.ts new file mode 100644 index 00000000000..aa22102a446 --- /dev/null +++ b/apps/sim/tools/context_dev/extract_products.ts @@ -0,0 +1,89 @@ +import type { + ContextDevExtractProductsParams, + ContextDevExtractProductsResponse, +} from '@/tools/context_dev/types' +import { PRODUCT_OUTPUT_PROPERTIES } from '@/tools/context_dev/types' +import { + CONTEXT_DEV_BASE_URL, + CREDIT_OUTPUTS, + contextDevJsonHeaders, + extractCreditMetadata, + parseContextDevResponse, +} from '@/tools/context_dev/utils' +import type { ToolConfig } from '@/tools/types' + +export const contextDevExtractProductsTool: ToolConfig< + ContextDevExtractProductsParams, + ContextDevExtractProductsResponse +> = { + id: 'context_dev_extract_products', + name: 'Context.dev Extract Products', + description: "Extract the product catalog from a brand's website by domain (beta).", + version: '1.0.0', + + params: { + domain: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: 'The domain to extract products from (e.g., "example.com")', + }, + maxProducts: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Maximum number of products to extract (1-12)', + }, + maxAgeMs: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Cache duration in milliseconds (0-2592000000, default: 604800000)', + }, + timeoutMS: { + type: 'number', + required: false, + visibility: 'user-or-llm', + 
description: 'Request timeout in milliseconds (1000-300000)', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'Context.dev API key', + }, + }, + + request: { + method: 'POST', + url: () => `${CONTEXT_DEV_BASE_URL}/brand/ai/products`, + headers: (params) => contextDevJsonHeaders(params.apiKey), + body: (params) => { + const body: Record = { domain: params.domain } + if (params.maxProducts != null) body.maxProducts = params.maxProducts + if (params.maxAgeMs != null) body.maxAgeMs = params.maxAgeMs + if (params.timeoutMS != null) body.timeoutMS = params.timeoutMS + return body + }, + }, + + transformResponse: async (response: Response) => { + const data = await parseContextDevResponse(response) + return { + success: true, + output: { + products: data.products ?? [], + ...extractCreditMetadata(data.key_metadata), + }, + } + }, + + outputs: { + products: { + type: 'array', + description: 'Extracted products with pricing, features, and metadata', + items: { type: 'object', properties: PRODUCT_OUTPUT_PROPERTIES }, + }, + ...CREDIT_OUTPUTS, + }, +} diff --git a/apps/sim/tools/context_dev/get_brand.ts b/apps/sim/tools/context_dev/get_brand.ts index 26a66596511..daffe65f08e 100644 --- a/apps/sim/tools/context_dev/get_brand.ts +++ b/apps/sim/tools/context_dev/get_brand.ts @@ -1,116 +1,88 @@ -import type { - ContextDevGetBrandParams, - ContextDevGetBrandResponse, -} from '@/tools/context_dev/types' +import type { ContextDevBrandResponse, ContextDevGetBrandParams } from '@/tools/context_dev/types' +import { BRAND_OUTPUT_PROPERTIES } from '@/tools/context_dev/types' import { appendParam, CONTEXT_DEV_BASE_URL, CREDIT_OUTPUTS, contextDevHeaders, - extractCreditMetadata, parseContextDevResponse, + transformBrandResponse, } from '@/tools/context_dev/utils' import type { ToolConfig } from '@/tools/types' -export const contextDevGetBrandTool: ToolConfig< - ContextDevGetBrandParams, - ContextDevGetBrandResponse -> = { - id: 
'context_dev_get_brand', - name: 'Context.dev Get Brand', - description: - 'Retrieve brand data for a domain: logos, colors, backdrops, socials, address, and industry.', - version: '1.0.0', +export const contextDevGetBrandTool: ToolConfig = + { + id: 'context_dev_get_brand', + name: 'Context.dev Get Brand', + description: + 'Retrieve brand data for a domain: logos, colors, backdrops, socials, address, and industry.', + version: '1.0.0', - params: { - domain: { - type: 'string', - required: true, - visibility: 'user-or-llm', - description: 'The domain to retrieve brand data for (e.g., "airbnb.com")', - }, - forceLanguage: { - type: 'string', - required: false, - visibility: 'user-or-llm', - description: 'Override the detected language with a supported language code', - }, - maxSpeed: { - type: 'boolean', - required: false, - visibility: 'user-or-llm', - description: 'Skip time-consuming operations for a faster response (default: false)', - }, - maxAgeMs: { - type: 'number', - required: false, - visibility: 'user-or-llm', - description: 'Cache max age in milliseconds (86400000-31536000000, default: 7776000000)', - }, - timeoutMS: { - type: 'number', - required: false, - visibility: 'user-or-llm', - description: 'Request timeout in milliseconds (1000-300000)', - }, - apiKey: { - type: 'string', - required: true, - visibility: 'user-only', - description: 'Context.dev API key', + params: { + domain: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: 'The domain to retrieve brand data for (e.g., "airbnb.com")', + }, + forceLanguage: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: 'Override the detected language with a supported language code', + }, + maxSpeed: { + type: 'boolean', + required: false, + visibility: 'user-or-llm', + description: 'Skip time-consuming operations for a faster response (default: false)', + }, + maxAgeMs: { + type: 'number', + required: false, + visibility: 'user-or-llm', + 
description: 'Cache max age in milliseconds (86400000-31536000000, default: 7776000000)', + }, + timeoutMS: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Request timeout in milliseconds (1000-300000)', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'Context.dev API key', + }, }, - }, - request: { - method: 'GET', - url: (params) => { - const url = new URL(`${CONTEXT_DEV_BASE_URL}/brand/retrieve`) - appendParam(url.searchParams, 'domain', params.domain) - appendParam(url.searchParams, 'force_language', params.forceLanguage) - appendParam(url.searchParams, 'maxSpeed', params.maxSpeed) - appendParam(url.searchParams, 'maxAgeMs', params.maxAgeMs) - appendParam(url.searchParams, 'timeoutMS', params.timeoutMS) - return url.toString() + request: { + method: 'GET', + url: (params) => { + const url = new URL(`${CONTEXT_DEV_BASE_URL}/brand/retrieve`) + appendParam(url.searchParams, 'domain', params.domain) + appendParam(url.searchParams, 'force_language', params.forceLanguage) + appendParam(url.searchParams, 'maxSpeed', params.maxSpeed) + appendParam(url.searchParams, 'maxAgeMs', params.maxAgeMs) + appendParam(url.searchParams, 'timeoutMS', params.timeoutMS) + return url.toString() + }, + headers: (params) => contextDevHeaders(params.apiKey), }, - headers: (params) => contextDevHeaders(params.apiKey), - }, - transformResponse: async (response: Response) => { - const data = await parseContextDevResponse(response) - return { - success: true, - output: { - status: data.status ?? '', - brand: data.brand ?? 
null, - ...extractCreditMetadata(data.key_metadata), - }, - } - }, + transformResponse: async (response: Response) => { + const data = await parseContextDevResponse(response) + return { success: true, output: transformBrandResponse(data) } + }, - outputs: { - status: { type: 'string', description: 'Retrieval status' }, - brand: { - type: 'object', - description: 'Brand data object', - properties: { - domain: { type: 'string', description: 'Brand domain' }, - title: { type: 'string', description: 'Brand title' }, - description: { type: 'string', description: 'Brand description' }, - slogan: { type: 'string', description: 'Brand slogan' }, - colors: { type: 'json', description: 'Brand colors (hex and name)' }, - logos: { type: 'json', description: 'Brand logos with mode, colors, resolution, and type' }, - backdrops: { type: 'json', description: 'Brand backdrop images' }, - socials: { type: 'json', description: 'Social media profiles (type and url)' }, - address: { type: 'json', description: 'Brand address' }, - stock: { type: 'json', description: 'Stock info (ticker and exchange)' }, - is_nsfw: { type: 'boolean', description: 'Whether the brand contains adult content' }, - email: { type: 'string', description: 'Brand contact email' }, - phone: { type: 'string', description: 'Brand contact phone' }, - industries: { type: 'json', description: 'Industry taxonomy (eic pairs)' }, - links: { type: 'json', description: 'Key brand links (careers, privacy, terms, etc.)' }, - primary_language: { type: 'string', description: 'Primary language of the brand site' }, + outputs: { + status: { type: 'string', description: 'Retrieval status' }, + brand: { + type: 'object', + description: 'Brand data object', + properties: BRAND_OUTPUT_PROPERTIES, }, + ...CREDIT_OUTPUTS, }, - ...CREDIT_OUTPUTS, - }, -} + } diff --git a/apps/sim/tools/context_dev/get_brand_by_email.ts b/apps/sim/tools/context_dev/get_brand_by_email.ts new file mode 100644 index 00000000000..e93fbacdfdf --- /dev/null 
+++ b/apps/sim/tools/context_dev/get_brand_by_email.ts @@ -0,0 +1,93 @@ +import type { + ContextDevBrandResponse, + ContextDevGetBrandByEmailParams, +} from '@/tools/context_dev/types' +import { BRAND_OUTPUT_PROPERTIES } from '@/tools/context_dev/types' +import { + appendParam, + CONTEXT_DEV_BASE_URL, + CREDIT_OUTPUTS, + contextDevHeaders, + parseContextDevResponse, + transformBrandResponse, +} from '@/tools/context_dev/utils' +import type { ToolConfig } from '@/tools/types' + +export const contextDevGetBrandByEmailTool: ToolConfig< + ContextDevGetBrandByEmailParams, + ContextDevBrandResponse +> = { + id: 'context_dev_get_brand_by_email', + name: 'Context.dev Get Brand by Email', + description: + 'Retrieve brand data from a work email address. Free/disposable emails are rejected (422).', + version: '1.0.0', + + params: { + email: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: 'Work email address; the domain is extracted (free providers are rejected)', + }, + forceLanguage: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: 'Override the detected language with a supported language code', + }, + maxSpeed: { + type: 'boolean', + required: false, + visibility: 'user-or-llm', + description: 'Skip time-consuming operations for a faster response (default: false)', + }, + maxAgeMs: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Cache max age in milliseconds (86400000-31536000000, default: 7776000000)', + }, + timeoutMS: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Request timeout in milliseconds (1000-300000)', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'Context.dev API key', + }, + }, + + request: { + method: 'GET', + url: (params) => { + const url = new URL(`${CONTEXT_DEV_BASE_URL}/brand/retrieve-by-email`) + appendParam(url.searchParams, 'email', params.email) + 
appendParam(url.searchParams, 'force_language', params.forceLanguage) + appendParam(url.searchParams, 'maxSpeed', params.maxSpeed) + appendParam(url.searchParams, 'maxAgeMs', params.maxAgeMs) + appendParam(url.searchParams, 'timeoutMS', params.timeoutMS) + return url.toString() + }, + headers: (params) => contextDevHeaders(params.apiKey), + }, + + transformResponse: async (response: Response) => { + const data = await parseContextDevResponse(response) + return { success: true, output: transformBrandResponse(data) } + }, + + outputs: { + status: { type: 'string', description: 'Retrieval status' }, + brand: { + type: 'object', + description: 'Brand data object', + properties: BRAND_OUTPUT_PROPERTIES, + }, + ...CREDIT_OUTPUTS, + }, +} diff --git a/apps/sim/tools/context_dev/get_brand_by_name.ts b/apps/sim/tools/context_dev/get_brand_by_name.ts new file mode 100644 index 00000000000..78f8970e49e --- /dev/null +++ b/apps/sim/tools/context_dev/get_brand_by_name.ts @@ -0,0 +1,100 @@ +import type { + ContextDevBrandResponse, + ContextDevGetBrandByNameParams, +} from '@/tools/context_dev/types' +import { BRAND_OUTPUT_PROPERTIES } from '@/tools/context_dev/types' +import { + appendParam, + CONTEXT_DEV_BASE_URL, + CREDIT_OUTPUTS, + contextDevHeaders, + parseContextDevResponse, + transformBrandResponse, +} from '@/tools/context_dev/utils' +import type { ToolConfig } from '@/tools/types' + +export const contextDevGetBrandByNameTool: ToolConfig< + ContextDevGetBrandByNameParams, + ContextDevBrandResponse +> = { + id: 'context_dev_get_brand_by_name', + name: 'Context.dev Get Brand by Name', + description: + 'Retrieve brand data by company name: logos, colors, socials, address, and industry.', + version: '1.0.0', + + params: { + name: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: 'Company name to retrieve brand data for (3-30 chars, e.g., "Apple Inc")', + }, + countryGl: { + type: 'string', + required: false, + visibility: 'user-or-llm', + 
description: 'ISO 2-letter country code to prioritize (e.g., "us")', + }, + forceLanguage: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: 'Override the detected language with a supported language code', + }, + maxSpeed: { + type: 'boolean', + required: false, + visibility: 'user-or-llm', + description: 'Skip time-consuming operations for a faster response (default: false)', + }, + maxAgeMs: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Cache max age in milliseconds (86400000-31536000000, default: 7776000000)', + }, + timeoutMS: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Request timeout in milliseconds (1000-300000)', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'Context.dev API key', + }, + }, + + request: { + method: 'GET', + url: (params) => { + const url = new URL(`${CONTEXT_DEV_BASE_URL}/brand/retrieve-by-name`) + appendParam(url.searchParams, 'name', params.name) + appendParam(url.searchParams, 'country_gl', params.countryGl) + appendParam(url.searchParams, 'force_language', params.forceLanguage) + appendParam(url.searchParams, 'maxSpeed', params.maxSpeed) + appendParam(url.searchParams, 'maxAgeMs', params.maxAgeMs) + appendParam(url.searchParams, 'timeoutMS', params.timeoutMS) + return url.toString() + }, + headers: (params) => contextDevHeaders(params.apiKey), + }, + + transformResponse: async (response: Response) => { + const data = await parseContextDevResponse(response) + return { success: true, output: transformBrandResponse(data) } + }, + + outputs: { + status: { type: 'string', description: 'Retrieval status' }, + brand: { + type: 'object', + description: 'Brand data object', + properties: BRAND_OUTPUT_PROPERTIES, + }, + ...CREDIT_OUTPUTS, + }, +} diff --git a/apps/sim/tools/context_dev/get_brand_by_ticker.ts b/apps/sim/tools/context_dev/get_brand_by_ticker.ts new file mode 100644 index 
00000000000..ff0d92b4018 --- /dev/null +++ b/apps/sim/tools/context_dev/get_brand_by_ticker.ts @@ -0,0 +1,99 @@ +import type { + ContextDevBrandResponse, + ContextDevGetBrandByTickerParams, +} from '@/tools/context_dev/types' +import { BRAND_OUTPUT_PROPERTIES } from '@/tools/context_dev/types' +import { + appendParam, + CONTEXT_DEV_BASE_URL, + CREDIT_OUTPUTS, + contextDevHeaders, + parseContextDevResponse, + transformBrandResponse, +} from '@/tools/context_dev/utils' +import type { ToolConfig } from '@/tools/types' + +export const contextDevGetBrandByTickerTool: ToolConfig< + ContextDevGetBrandByTickerParams, + ContextDevBrandResponse +> = { + id: 'context_dev_get_brand_by_ticker', + name: 'Context.dev Get Brand by Ticker', + description: 'Retrieve brand data for a public company by its stock ticker symbol.', + version: '1.0.0', + + params: { + ticker: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: 'Stock ticker symbol (e.g., "AAPL", "GOOGL", "BRK.A")', + }, + tickerExchange: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: 'Exchange code for the ticker (e.g., "NASDAQ", "NYSE", "LSE"). 
Default: NASDAQ', + }, + forceLanguage: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: 'Override the detected language with a supported language code', + }, + maxSpeed: { + type: 'boolean', + required: false, + visibility: 'user-or-llm', + description: 'Skip time-consuming operations for a faster response (default: false)', + }, + maxAgeMs: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Cache max age in milliseconds (86400000-31536000000, default: 7776000000)', + }, + timeoutMS: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Request timeout in milliseconds (1000-300000)', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'Context.dev API key', + }, + }, + + request: { + method: 'GET', + url: (params) => { + const url = new URL(`${CONTEXT_DEV_BASE_URL}/brand/retrieve-by-ticker`) + appendParam(url.searchParams, 'ticker', params.ticker) + appendParam(url.searchParams, 'ticker_exchange', params.tickerExchange) + appendParam(url.searchParams, 'force_language', params.forceLanguage) + appendParam(url.searchParams, 'maxSpeed', params.maxSpeed) + appendParam(url.searchParams, 'maxAgeMs', params.maxAgeMs) + appendParam(url.searchParams, 'timeoutMS', params.timeoutMS) + return url.toString() + }, + headers: (params) => contextDevHeaders(params.apiKey), + }, + + transformResponse: async (response: Response) => { + const data = await parseContextDevResponse(response) + return { success: true, output: transformBrandResponse(data) } + }, + + outputs: { + status: { type: 'string', description: 'Retrieval status' }, + brand: { + type: 'object', + description: 'Brand data object', + properties: BRAND_OUTPUT_PROPERTIES, + }, + ...CREDIT_OUTPUTS, + }, +} diff --git a/apps/sim/tools/context_dev/get_brand_simplified.ts b/apps/sim/tools/context_dev/get_brand_simplified.ts new file mode 100644 index 00000000000..f120104139c --- 
/dev/null +++ b/apps/sim/tools/context_dev/get_brand_simplified.ts @@ -0,0 +1,78 @@ +import type { + ContextDevGetBrandSimplifiedParams, + ContextDevGetBrandSimplifiedResponse, +} from '@/tools/context_dev/types' +import { SIMPLIFIED_BRAND_OUTPUT_PROPERTIES } from '@/tools/context_dev/types' +import { + appendParam, + CONTEXT_DEV_BASE_URL, + CREDIT_OUTPUTS, + contextDevHeaders, + parseContextDevResponse, + transformBrandResponse, +} from '@/tools/context_dev/utils' +import type { ToolConfig } from '@/tools/types' + +export const contextDevGetBrandSimplifiedTool: ToolConfig< + ContextDevGetBrandSimplifiedParams, + ContextDevGetBrandSimplifiedResponse +> = { + id: 'context_dev_get_brand_simplified', + name: 'Context.dev Get Brand (Simplified)', + description: 'Retrieve essential brand data for a domain: title, colors, logos, and backdrops.', + version: '1.0.0', + + params: { + domain: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: 'The domain to retrieve simplified brand data for (e.g., "airbnb.com")', + }, + maxAgeMs: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Cache max age in milliseconds (86400000-31536000000, default: 7776000000)', + }, + timeoutMS: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Request timeout in milliseconds (1000-300000)', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'Context.dev API key', + }, + }, + + request: { + method: 'GET', + url: (params) => { + const url = new URL(`${CONTEXT_DEV_BASE_URL}/brand/retrieve-simplified`) + appendParam(url.searchParams, 'domain', params.domain) + appendParam(url.searchParams, 'maxAgeMs', params.maxAgeMs) + appendParam(url.searchParams, 'timeoutMS', params.timeoutMS) + return url.toString() + }, + headers: (params) => contextDevHeaders(params.apiKey), + }, + + transformResponse: async (response: Response) => { + const data = await 
parseContextDevResponse(response) + return { success: true, output: transformBrandResponse(data) } + }, + + outputs: { + status: { type: 'string', description: 'Retrieval status' }, + brand: { + type: 'object', + description: 'Simplified brand data (domain, title, colors, logos, backdrops)', + properties: SIMPLIFIED_BRAND_OUTPUT_PROPERTIES, + }, + ...CREDIT_OUTPUTS, + }, +} diff --git a/apps/sim/tools/context_dev/identify_transaction.ts b/apps/sim/tools/context_dev/identify_transaction.ts new file mode 100644 index 00000000000..47cba312dce --- /dev/null +++ b/apps/sim/tools/context_dev/identify_transaction.ts @@ -0,0 +1,121 @@ +import type { + ContextDevBrandResponse, + ContextDevIdentifyTransactionParams, +} from '@/tools/context_dev/types' +import { BRAND_OUTPUT_PROPERTIES } from '@/tools/context_dev/types' +import { + appendParam, + CONTEXT_DEV_BASE_URL, + CREDIT_OUTPUTS, + contextDevHeaders, + parseContextDevResponse, + transformBrandResponse, +} from '@/tools/context_dev/utils' +import type { ToolConfig } from '@/tools/types' + +export const contextDevIdentifyTransactionTool: ToolConfig< + ContextDevIdentifyTransactionParams, + ContextDevBrandResponse +> = { + id: 'context_dev_identify_transaction', + name: 'Context.dev Identify Transaction', + description: + 'Identify the brand behind a raw bank/card transaction descriptor and return its brand data.', + version: '1.0.0', + + params: { + transactionInfo: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: 'The raw transaction descriptor or identifier to resolve to a brand', + }, + countryGl: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: 'ISO 2-letter country code from the transaction (e.g., "us", "gb")', + }, + city: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: 'City name to prioritize in the search', + }, + mcc: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: 'Merchant 
Category Code for the business category', + }, + phone: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Phone number from the transaction for verification', + }, + highConfidenceOnly: { + type: 'boolean', + required: false, + visibility: 'user-or-llm', + description: 'Enforce additional verification steps for higher confidence (default: false)', + }, + forceLanguage: { + type: 'string', + required: false, + visibility: 'user-or-llm', + description: 'Override the detected language with a supported language code', + }, + maxSpeed: { + type: 'boolean', + required: false, + visibility: 'user-or-llm', + description: 'Skip time-consuming operations for a faster response (default: false)', + }, + timeoutMS: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Request timeout in milliseconds (1000-300000)', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'Context.dev API key', + }, + }, + + request: { + method: 'GET', + url: (params) => { + const url = new URL(`${CONTEXT_DEV_BASE_URL}/brand/transaction_identifier`) + appendParam(url.searchParams, 'transaction_info', params.transactionInfo) + appendParam(url.searchParams, 'country_gl', params.countryGl) + appendParam(url.searchParams, 'city', params.city) + appendParam(url.searchParams, 'mcc', params.mcc) + appendParam(url.searchParams, 'phone', params.phone) + appendParam(url.searchParams, 'high_confidence_only', params.highConfidenceOnly) + appendParam(url.searchParams, 'force_language', params.forceLanguage) + appendParam(url.searchParams, 'maxSpeed', params.maxSpeed) + appendParam(url.searchParams, 'timeoutMS', params.timeoutMS) + return url.toString() + }, + headers: (params) => contextDevHeaders(params.apiKey), + }, + + transformResponse: async (response: Response) => { + const data = await parseContextDevResponse(response) + return { success: true, output: transformBrandResponse(data) } + }, + + outputs: 
{ + status: { type: 'string', description: 'Identification status' }, + brand: { + type: 'object', + description: 'Brand data for the identified merchant', + properties: BRAND_OUTPUT_PROPERTIES, + }, + ...CREDIT_OUTPUTS, + }, +} diff --git a/apps/sim/tools/context_dev/index.ts b/apps/sim/tools/context_dev/index.ts index 1fb08c8881d..9848723ebf3 100644 --- a/apps/sim/tools/context_dev/index.ts +++ b/apps/sim/tools/context_dev/index.ts @@ -2,9 +2,21 @@ export { contextDevClassifyNaicsTool } from '@/tools/context_dev/classify_naics' export { contextDevClassifySicTool } from '@/tools/context_dev/classify_sic' export { contextDevCrawlTool } from '@/tools/context_dev/crawl' export { contextDevExtractTool } from '@/tools/context_dev/extract' +export { contextDevExtractProductTool } from '@/tools/context_dev/extract_product' +export { contextDevExtractProductsTool } from '@/tools/context_dev/extract_products' export { contextDevGetBrandTool } from '@/tools/context_dev/get_brand' +export { contextDevGetBrandByEmailTool } from '@/tools/context_dev/get_brand_by_email' +export { contextDevGetBrandByNameTool } from '@/tools/context_dev/get_brand_by_name' +export { contextDevGetBrandByTickerTool } from '@/tools/context_dev/get_brand_by_ticker' +export { contextDevGetBrandSimplifiedTool } from '@/tools/context_dev/get_brand_simplified' +export { contextDevIdentifyTransactionTool } from '@/tools/context_dev/identify_transaction' export { contextDevMapTool } from '@/tools/context_dev/map' +export { contextDevPrefetchByEmailTool } from '@/tools/context_dev/prefetch_by_email' +export { contextDevPrefetchDomainTool } from '@/tools/context_dev/prefetch_domain' +export { contextDevScrapeFontsTool } from '@/tools/context_dev/scrape_fonts' export { contextDevScrapeHtmlTool } from '@/tools/context_dev/scrape_html' +export { contextDevScrapeImagesTool } from '@/tools/context_dev/scrape_images' export { contextDevScrapeMarkdownTool } from '@/tools/context_dev/scrape_markdown' +export { 
contextDevScrapeStyleguideTool } from '@/tools/context_dev/scrape_styleguide' export { contextDevScreenshotTool } from '@/tools/context_dev/screenshot' export { contextDevSearchTool } from '@/tools/context_dev/search' diff --git a/apps/sim/tools/context_dev/prefetch_by_email.ts b/apps/sim/tools/context_dev/prefetch_by_email.ts new file mode 100644 index 00000000000..af0ed013cd6 --- /dev/null +++ b/apps/sim/tools/context_dev/prefetch_by_email.ts @@ -0,0 +1,75 @@ +import type { + ContextDevPrefetchByEmailParams, + ContextDevPrefetchResponse, +} from '@/tools/context_dev/types' +import { + CONTEXT_DEV_BASE_URL, + CREDIT_OUTPUTS, + contextDevJsonHeaders, + extractCreditMetadata, + parseContextDevResponse, +} from '@/tools/context_dev/utils' +import type { ToolConfig } from '@/tools/types' + +export const contextDevPrefetchByEmailTool: ToolConfig< + ContextDevPrefetchByEmailParams, + ContextDevPrefetchResponse +> = { + id: 'context_dev_prefetch_by_email', + name: 'Context.dev Prefetch by Email', + description: + "Queue an email's domain for brand-data prefetching to reduce later latency (subscribers; 0 credits). 
Free/disposable emails are rejected.", + version: '1.0.0', + + params: { + email: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: 'Work email address whose domain should be prefetched (free providers rejected)', + }, + timeoutMS: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Request timeout in milliseconds (1000-300000)', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'Context.dev API key', + }, + }, + + request: { + method: 'POST', + url: () => `${CONTEXT_DEV_BASE_URL}/brand/prefetch-by-email`, + headers: (params) => contextDevJsonHeaders(params.apiKey), + body: (params) => { + const body: Record = { email: params.email } + if (params.timeoutMS != null) body.timeoutMS = params.timeoutMS + return body + }, + }, + + transformResponse: async (response: Response) => { + const data = await parseContextDevResponse(response) + return { + success: true, + output: { + status: data.status ?? '', + message: data.message ?? '', + domain: data.domain ?? 
'', + ...extractCreditMetadata(data.key_metadata), + }, + } + }, + + outputs: { + status: { type: 'string', description: 'Prefetch status' }, + message: { type: 'string', description: 'Human-readable prefetch result message' }, + domain: { type: 'string', description: 'The domain queued for prefetching' }, + ...CREDIT_OUTPUTS, + }, +} diff --git a/apps/sim/tools/context_dev/prefetch_domain.ts b/apps/sim/tools/context_dev/prefetch_domain.ts new file mode 100644 index 00000000000..77d33d08520 --- /dev/null +++ b/apps/sim/tools/context_dev/prefetch_domain.ts @@ -0,0 +1,75 @@ +import type { + ContextDevPrefetchDomainParams, + ContextDevPrefetchResponse, +} from '@/tools/context_dev/types' +import { + CONTEXT_DEV_BASE_URL, + CREDIT_OUTPUTS, + contextDevJsonHeaders, + extractCreditMetadata, + parseContextDevResponse, +} from '@/tools/context_dev/utils' +import type { ToolConfig } from '@/tools/types' + +export const contextDevPrefetchDomainTool: ToolConfig< + ContextDevPrefetchDomainParams, + ContextDevPrefetchResponse +> = { + id: 'context_dev_prefetch_domain', + name: 'Context.dev Prefetch Domain', + description: + 'Queue a domain for brand-data prefetching to reduce latency on later requests (subscribers; 0 credits).', + version: '1.0.0', + + params: { + domain: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: 'The domain to prefetch brand data for (e.g., "example.com")', + }, + timeoutMS: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Request timeout in milliseconds (1000-300000)', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'Context.dev API key', + }, + }, + + request: { + method: 'POST', + url: () => `${CONTEXT_DEV_BASE_URL}/brand/prefetch`, + headers: (params) => contextDevJsonHeaders(params.apiKey), + body: (params) => { + const body: Record = { domain: params.domain } + if (params.timeoutMS != null) body.timeoutMS = params.timeoutMS + return 
body + }, + }, + + transformResponse: async (response: Response) => { + const data = await parseContextDevResponse(response) + return { + success: true, + output: { + status: data.status ?? '', + message: data.message ?? '', + domain: data.domain ?? '', + ...extractCreditMetadata(data.key_metadata), + }, + } + }, + + outputs: { + status: { type: 'string', description: 'Prefetch status' }, + message: { type: 'string', description: 'Human-readable prefetch result message' }, + domain: { type: 'string', description: 'The domain queued for prefetching' }, + ...CREDIT_OUTPUTS, + }, +} diff --git a/apps/sim/tools/context_dev/scrape_fonts.ts b/apps/sim/tools/context_dev/scrape_fonts.ts new file mode 100644 index 00000000000..c64c676bf9e --- /dev/null +++ b/apps/sim/tools/context_dev/scrape_fonts.ts @@ -0,0 +1,92 @@ +import type { + ContextDevScrapeFontsParams, + ContextDevScrapeFontsResponse, +} from '@/tools/context_dev/types' +import { FONT_OUTPUT_PROPERTIES } from '@/tools/context_dev/types' +import { + appendParam, + CONTEXT_DEV_BASE_URL, + CREDIT_OUTPUTS, + contextDevHeaders, + extractCreditMetadata, + parseContextDevResponse, +} from '@/tools/context_dev/utils' +import type { ToolConfig } from '@/tools/types' + +export const contextDevScrapeFontsTool: ToolConfig< + ContextDevScrapeFontsParams, + ContextDevScrapeFontsResponse +> = { + id: 'context_dev_scrape_fonts', + name: 'Context.dev Scrape Fonts', + description: 'Extract the font families, usage stats, and font files used by a domain.', + version: '1.0.0', + + params: { + domain: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: 'The domain to extract fonts from (e.g., "example.com")', + }, + maxAgeMs: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Cache max age in milliseconds (86400000-31536000000, default: 7776000000)', + }, + timeoutMS: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Request timeout in 
milliseconds (1000-300000)', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'Context.dev API key', + }, + }, + + request: { + method: 'GET', + url: (params) => { + const url = new URL(`${CONTEXT_DEV_BASE_URL}/web/fonts`) + appendParam(url.searchParams, 'domain', params.domain) + appendParam(url.searchParams, 'maxAgeMs', params.maxAgeMs) + appendParam(url.searchParams, 'timeoutMS', params.timeoutMS) + return url.toString() + }, + headers: (params) => contextDevHeaders(params.apiKey), + }, + + transformResponse: async (response: Response) => { + const data = await parseContextDevResponse(response) + return { + success: true, + output: { + status: data.status ?? '', + domain: data.domain ?? '', + fonts: data.fonts ?? [], + fontLinks: data.fontLinks ?? {}, + ...extractCreditMetadata(data.key_metadata), + }, + } + }, + + outputs: { + status: { type: 'string', description: 'Extraction status' }, + domain: { type: 'string', description: 'The domain that was analyzed' }, + fonts: { + type: 'array', + description: 'Fonts with usage statistics and fallbacks', + items: { type: 'object', properties: FONT_OUTPUT_PROPERTIES }, + }, + fontLinks: { + type: 'json', + description: 'Font family download links keyed by font name (type, files, category)', + }, + ...CREDIT_OUTPUTS, + }, +} diff --git a/apps/sim/tools/context_dev/scrape_images.ts b/apps/sim/tools/context_dev/scrape_images.ts new file mode 100644 index 00000000000..d2dfade8acb --- /dev/null +++ b/apps/sim/tools/context_dev/scrape_images.ts @@ -0,0 +1,115 @@ +import type { + ContextDevScrapeImagesParams, + ContextDevScrapeImagesResponse, +} from '@/tools/context_dev/types' +import { IMAGE_OUTPUT_PROPERTIES } from '@/tools/context_dev/types' +import { + appendParam, + CONTEXT_DEV_BASE_URL, + CREDIT_OUTPUTS, + contextDevHeaders, + extractCreditMetadata, + parseContextDevResponse, +} from '@/tools/context_dev/utils' +import type { ToolConfig } from '@/tools/types' + +export 
const contextDevScrapeImagesTool: ToolConfig< + ContextDevScrapeImagesParams, + ContextDevScrapeImagesResponse +> = { + id: 'context_dev_scrape_images', + name: 'Context.dev Scrape Images', + description: 'Discover every image asset on a page, with optional dimension and type enrichment.', + version: '1.0.0', + + params: { + url: { + type: 'string', + required: true, + visibility: 'user-or-llm', + description: 'The full URL to scrape images from (must include http:// or https://)', + }, + maxAgeMs: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Cache duration in milliseconds (0-2592000000, default: 86400000)', + }, + waitForMs: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Browser wait time after page load in milliseconds (0-30000)', + }, + timeoutMS: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Request timeout in milliseconds (1000-300000)', + }, + enrichResolution: { + type: 'boolean', + required: false, + visibility: 'user-or-llm', + description: 'Measure image dimensions (enables 5-credit enrichment)', + }, + enrichHostedUrl: { + type: 'boolean', + required: false, + visibility: 'user-or-llm', + description: 'Host images on a CDN and return their URL and MIME type (enables enrichment)', + }, + enrichClassification: { + type: 'boolean', + required: false, + visibility: 'user-or-llm', + description: 'Classify each image by visual asset type (enables enrichment)', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'Context.dev API key', + }, + }, + + request: { + method: 'GET', + url: (params) => { + const url = new URL(`${CONTEXT_DEV_BASE_URL}/web/scrape/images`) + appendParam(url.searchParams, 'url', params.url) + appendParam(url.searchParams, 'maxAgeMs', params.maxAgeMs) + appendParam(url.searchParams, 'waitForMs', params.waitForMs) + appendParam(url.searchParams, 'timeoutMS', params.timeoutMS) + 
appendParam(url.searchParams, 'enrichment[resolution]', params.enrichResolution) + appendParam(url.searchParams, 'enrichment[hostedUrl]', params.enrichHostedUrl) + appendParam(url.searchParams, 'enrichment[classification]', params.enrichClassification) + return url.toString() + }, + headers: (params) => contextDevHeaders(params.apiKey), + }, + + transformResponse: async (response: Response) => { + const data = await parseContextDevResponse(response) + return { + success: true, + output: { + success: data.success ?? true, + images: data.images ?? [], + url: data.url ?? '', + ...extractCreditMetadata(data.key_metadata), + }, + } + }, + + outputs: { + success: { type: 'boolean', description: 'Whether the scrape succeeded' }, + images: { + type: 'array', + description: 'Discovered image assets with source, element, type, and optional enrichment', + items: { type: 'object', properties: IMAGE_OUTPUT_PROPERTIES }, + }, + url: { type: 'string', description: 'The scraped URL' }, + ...CREDIT_OUTPUTS, + }, +} diff --git a/apps/sim/tools/context_dev/scrape_styleguide.ts b/apps/sim/tools/context_dev/scrape_styleguide.ts new file mode 100644 index 00000000000..3be34c35d44 --- /dev/null +++ b/apps/sim/tools/context_dev/scrape_styleguide.ts @@ -0,0 +1,87 @@ +import type { + ContextDevScrapeStyleguideParams, + ContextDevScrapeStyleguideResponse, +} from '@/tools/context_dev/types' +import { + appendParam, + CONTEXT_DEV_BASE_URL, + CREDIT_OUTPUTS, + contextDevHeaders, + extractCreditMetadata, + parseContextDevResponse, +} from '@/tools/context_dev/utils' +import type { ToolConfig } from '@/tools/types' + +export const contextDevScrapeStyleguideTool: ToolConfig< + ContextDevScrapeStyleguideParams, + ContextDevScrapeStyleguideResponse +> = { + id: 'context_dev_scrape_styleguide', + name: 'Context.dev Scrape Styleguide', + description: + "Extract a domain's design system: colors, typography, spacing, shadows, and UI components.", + version: '1.0.0', + + params: { + domain: { + type: 
'string', + required: true, + visibility: 'user-or-llm', + description: 'The domain to extract the styleguide from (e.g., "example.com")', + }, + maxAgeMs: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Cache max age in milliseconds (86400000-31536000000, default: 7776000000)', + }, + timeoutMS: { + type: 'number', + required: false, + visibility: 'user-or-llm', + description: 'Request timeout in milliseconds (1000-300000)', + }, + apiKey: { + type: 'string', + required: true, + visibility: 'user-only', + description: 'Context.dev API key', + }, + }, + + request: { + method: 'GET', + url: (params) => { + const url = new URL(`${CONTEXT_DEV_BASE_URL}/web/styleguide`) + appendParam(url.searchParams, 'domain', params.domain) + appendParam(url.searchParams, 'maxAgeMs', params.maxAgeMs) + appendParam(url.searchParams, 'timeoutMS', params.timeoutMS) + return url.toString() + }, + headers: (params) => contextDevHeaders(params.apiKey), + }, + + transformResponse: async (response: Response) => { + const data = await parseContextDevResponse(response) + return { + success: true, + output: { + status: data.status ?? '', + domain: data.domain ?? '', + styleguide: data.styleguide ?? 
null, + ...extractCreditMetadata(data.key_metadata), + }, + } + }, + + outputs: { + status: { type: 'string', description: 'Extraction status' }, + domain: { type: 'string', description: 'The domain that was analyzed' }, + styleguide: { + type: 'json', + description: + 'Design system: mode, colors, typography, elementSpacing, shadows, fontLinks, components', + }, + ...CREDIT_OUTPUTS, + }, +} diff --git a/apps/sim/tools/context_dev/types.ts b/apps/sim/tools/context_dev/types.ts index 4e7d1734318..63cde60c786 100644 --- a/apps/sim/tools/context_dev/types.ts +++ b/apps/sim/tools/context_dev/types.ts @@ -66,6 +66,25 @@ export interface ContextDevScreenshotResponse extends ToolResponse { } } +export interface ContextDevScrapeImagesParams { + apiKey: string + url: string + maxAgeMs?: number + waitForMs?: number + timeoutMS?: number + enrichResolution?: boolean + enrichHostedUrl?: boolean + enrichClassification?: boolean +} + +export interface ContextDevScrapeImagesResponse extends ToolResponse { + output: CreditFields & { + success: boolean + images: Array> + url: string + } +} + export interface ContextDevCrawlParams { apiKey: string url: string @@ -150,6 +169,66 @@ export interface ContextDevExtractResponse extends ToolResponse { } } +export interface ContextDevExtractProductParams { + apiKey: string + url: string + maxAgeMs?: number + timeoutMS?: number +} + +export interface ContextDevExtractProductResponse extends ToolResponse { + output: CreditFields & { + isProductPage: boolean + platform: string | null + product: Record | null + } +} + +export interface ContextDevExtractProductsParams { + apiKey: string + domain: string + maxProducts?: number + maxAgeMs?: number + timeoutMS?: number +} + +export interface ContextDevExtractProductsResponse extends ToolResponse { + output: CreditFields & { + products: Array> + } +} + +export interface ContextDevScrapeFontsParams { + apiKey: string + domain: string + maxAgeMs?: number + timeoutMS?: number +} + +export interface 
ContextDevScrapeFontsResponse extends ToolResponse { + output: CreditFields & { + status: string + domain: string + fonts: Array> + fontLinks: Record + } +} + +export interface ContextDevScrapeStyleguideParams { + apiKey: string + domain: string + maxAgeMs?: number + timeoutMS?: number +} + +export interface ContextDevScrapeStyleguideResponse extends ToolResponse { + output: CreditFields & { + status: string + domain: string + styleguide: Record | null + } +} + export interface ContextDevClassifyNaicsParams { apiKey: string input: string @@ -186,6 +265,14 @@ export interface ContextDevClassifySicResponse extends ToolResponse { } } +/** Shared response shape for every brand-returning endpoint (full brand object). */ +export interface ContextDevBrandResponse extends ToolResponse { + output: CreditFields & { + status: string + brand: Record | null + } +} + export interface ContextDevGetBrandParams { apiKey: string domain: string @@ -195,13 +282,83 @@ export interface ContextDevGetBrandParams { timeoutMS?: number } -export interface ContextDevGetBrandResponse extends ToolResponse { +export interface ContextDevGetBrandByNameParams { + apiKey: string + name: string + countryGl?: string + forceLanguage?: string + maxSpeed?: boolean + maxAgeMs?: number + timeoutMS?: number +} + +export interface ContextDevGetBrandByEmailParams { + apiKey: string + email: string + forceLanguage?: string + maxSpeed?: boolean + maxAgeMs?: number + timeoutMS?: number +} + +export interface ContextDevGetBrandByTickerParams { + apiKey: string + ticker: string + tickerExchange?: string + forceLanguage?: string + maxSpeed?: boolean + maxAgeMs?: number + timeoutMS?: number +} + +export interface ContextDevIdentifyTransactionParams { + apiKey: string + transactionInfo: string + countryGl?: string + city?: string + mcc?: string + phone?: number + highConfidenceOnly?: boolean + forceLanguage?: string + maxSpeed?: boolean + timeoutMS?: number +} + +export interface ContextDevGetBrandSimplifiedParams { 
+ apiKey: string + domain: string + maxAgeMs?: number + timeoutMS?: number +} + +export interface ContextDevGetBrandSimplifiedResponse extends ToolResponse { output: CreditFields & { status: string brand: Record | null } } +export interface ContextDevPrefetchByEmailParams { + apiKey: string + email: string + timeoutMS?: number +} + +export interface ContextDevPrefetchDomainParams { + apiKey: string + domain: string + timeoutMS?: number +} + +/** Shared response shape for the prefetch utility endpoints. */ +export interface ContextDevPrefetchResponse extends ToolResponse { + output: CreditFields & { + status: string + message: string + domain: string + } +} + /** Output schema for a single web search result. */ export const SEARCH_RESULT_OUTPUT_PROPERTIES = { url: { type: 'string', description: 'Result page URL' }, @@ -226,3 +383,82 @@ export const CLASSIFICATION_CODE_OUTPUT_PROPERTIES = { name: { type: 'string', description: 'Industry name' }, confidence: { type: 'string', description: 'Match confidence (high, medium, low)' }, } as const + +/** Output schema for the full brand object returned by brand-intelligence endpoints. 
*/ +export const BRAND_OUTPUT_PROPERTIES = { + domain: { type: 'string', description: 'Brand domain' }, + title: { type: 'string', description: 'Brand title' }, + description: { type: 'string', description: 'Brand description' }, + slogan: { type: 'string', description: 'Brand slogan' }, + colors: { type: 'json', description: 'Brand colors (hex and name)' }, + logos: { type: 'json', description: 'Brand logos with mode, colors, resolution, and type' }, + backdrops: { type: 'json', description: 'Brand backdrop images' }, + socials: { type: 'json', description: 'Social media profiles (type and url)' }, + address: { type: 'json', description: 'Brand address' }, + stock: { type: 'json', description: 'Stock info (ticker and exchange)' }, + is_nsfw: { type: 'boolean', description: 'Whether the brand contains adult content' }, + email: { type: 'string', description: 'Brand contact email' }, + phone: { type: 'string', description: 'Brand contact phone' }, + industries: { type: 'json', description: 'Industry taxonomy (eic industry/subindustry pairs)' }, + links: { type: 'json', description: 'Key brand links (careers, privacy, terms, blog, pricing)' }, + primary_language: { type: 'string', description: 'Primary language of the brand site' }, +} as const + +/** Output schema for the reduced brand object returned by the simplified endpoint. */ +export const SIMPLIFIED_BRAND_OUTPUT_PROPERTIES = { + domain: { type: 'string', description: 'Brand domain' }, + title: { type: 'string', description: 'Brand title' }, + colors: { type: 'json', description: 'Brand colors (hex and name)' }, + logos: { type: 'json', description: 'Brand logos with mode, colors, resolution, and type' }, + backdrops: { type: 'json', description: 'Brand backdrop images' }, +} as const + +/** Output schema for a single extracted product. 
*/ +export const PRODUCT_OUTPUT_PROPERTIES = { + name: { type: 'string', description: 'Product name' }, + description: { type: 'string', description: 'Product description' }, + price: { type: 'number', description: 'Product price' }, + currency: { type: 'string', description: 'Price currency' }, + billing_frequency: { + type: 'string', + description: 'Billing frequency (monthly, yearly, one_time, usage_based)', + }, + pricing_model: { + type: 'string', + description: 'Pricing model (per_seat, flat, tiered, freemium, custom)', + }, + url: { type: 'string', description: 'Product URL' }, + category: { type: 'string', description: 'Product category' }, + features: { type: 'json', description: 'Product features' }, + target_audience: { type: 'json', description: 'Target audience' }, + tags: { type: 'json', description: 'Product tags' }, + image_url: { type: 'string', description: 'Primary product image URL' }, + images: { type: 'json', description: 'Product image URLs' }, + sku: { type: 'string', description: 'Product SKU' }, +} as const + +/** Output schema for a single font usage entry. */ +export const FONT_OUTPUT_PROPERTIES = { + font: { type: 'string', description: 'Font family name' }, + uses: { type: 'json', description: 'Where the font is used' }, + fallbacks: { type: 'json', description: 'Fallback font families' }, + num_elements: { type: 'number', description: 'Number of elements using the font' }, + num_words: { type: 'number', description: 'Number of words rendered in the font' }, + percent_words: { type: 'number', description: 'Percent of words using the font' }, + percent_elements: { type: 'number', description: 'Percent of elements using the font' }, +} as const + +/** Output schema for a single scraped image. 
*/ +export const IMAGE_OUTPUT_PROPERTIES = { + src: { type: 'string', description: 'Image source URL or data' }, + element: { + type: 'string', + description: 'Source element (img, svg, link, source, video, css, object, meta, background)', + }, + type: { type: 'string', description: 'Image representation (url, html, base64)' }, + alt: { type: 'string', description: 'Alt text', optional: true }, + enrichment: { + type: 'json', + description: 'Optional enrichment (width, height, mimetype, url, type) when requested', + }, +} as const diff --git a/apps/sim/tools/context_dev/utils.ts b/apps/sim/tools/context_dev/utils.ts index f6243bef924..b46a28d626a 100644 --- a/apps/sim/tools/context_dev/utils.ts +++ b/apps/sim/tools/context_dev/utils.ts @@ -52,6 +52,23 @@ export function extractCreditMetadata(keyMetadata: ContextDevKeyMetadata | undef } } +/** + * Normalizes a brand-returning Context.dev response into the shared tool output shape. + * Used by every endpoint that returns a `brand` object. + */ +export function transformBrandResponse(data: any): { + status: string + brand: Record | null + creditsConsumed: number | null + creditsRemaining: number | null +} { + return { + status: data.status ?? '', + brand: data.brand ?? null, + ...extractCreditMetadata(data.key_metadata), + } +} + /** * Appends a parameter to a URLSearchParams instance only when it is defined and non-empty. * Booleans are serialized as the literal strings 'true' / 'false'. 
diff --git a/apps/sim/tools/registry.ts b/apps/sim/tools/registry.ts index 6aec72892a4..a5ffd6ba05d 100644 --- a/apps/sim/tools/registry.ts +++ b/apps/sim/tools/registry.ts @@ -526,11 +526,23 @@ import { contextDevClassifyNaicsTool, contextDevClassifySicTool, contextDevCrawlTool, + contextDevExtractProductsTool, + contextDevExtractProductTool, contextDevExtractTool, + contextDevGetBrandByEmailTool, + contextDevGetBrandByNameTool, + contextDevGetBrandByTickerTool, + contextDevGetBrandSimplifiedTool, contextDevGetBrandTool, + contextDevIdentifyTransactionTool, contextDevMapTool, + contextDevPrefetchByEmailTool, + contextDevPrefetchDomainTool, + contextDevScrapeFontsTool, contextDevScrapeHtmlTool, + contextDevScrapeImagesTool, contextDevScrapeMarkdownTool, + contextDevScrapeStyleguideTool, contextDevScreenshotTool, contextDevSearchTool, } from '@/tools/context_dev' @@ -5505,14 +5517,26 @@ export const tools: Record = { confluence_delete_space_property: confluenceDeleteSpacePropertyTool, context_dev_scrape_markdown: contextDevScrapeMarkdownTool, context_dev_scrape_html: contextDevScrapeHtmlTool, + context_dev_scrape_images: contextDevScrapeImagesTool, context_dev_screenshot: contextDevScreenshotTool, context_dev_crawl: contextDevCrawlTool, context_dev_map: contextDevMapTool, context_dev_search: contextDevSearchTool, context_dev_extract: contextDevExtractTool, + context_dev_extract_product: contextDevExtractProductTool, + context_dev_extract_products: contextDevExtractProductsTool, + context_dev_scrape_fonts: contextDevScrapeFontsTool, + context_dev_scrape_styleguide: contextDevScrapeStyleguideTool, context_dev_classify_naics: contextDevClassifyNaicsTool, context_dev_classify_sic: contextDevClassifySicTool, context_dev_get_brand: contextDevGetBrandTool, + context_dev_get_brand_by_name: contextDevGetBrandByNameTool, + context_dev_get_brand_by_email: contextDevGetBrandByEmailTool, + context_dev_get_brand_by_ticker: contextDevGetBrandByTickerTool, + 
context_dev_get_brand_simplified: contextDevGetBrandSimplifiedTool, + context_dev_identify_transaction: contextDevIdentifyTransactionTool, + context_dev_prefetch_domain: contextDevPrefetchDomainTool, + context_dev_prefetch_by_email: contextDevPrefetchByEmailTool, cursor_list_agents: cursorListAgentsTool, cursor_list_agents_v2: cursorListAgentsV2Tool, cursor_get_agent: cursorGetAgentTool, From 3ff8e334e2a85ce534e5ba2a2dbe89536ede27de Mon Sep 17 00:00:00 2001 From: waleed Date: Sun, 14 Jun 2026 19:35:05 -0700 Subject: [PATCH 3/3] fix(context-dev): wire includeFrames, split crawl/extract maxPages, derive screenshot MIME Addresses review feedback: - includeFrames is now a block subblock + param for scrape_markdown/scrape_html - crawl and extract use separate Max Pages fields (crawl 1-500, extract 1-50) so a crawl value can no longer be forwarded to extract beyond its limit - screenshot file MIME type and extension are derived from the returned URL instead of being hardcoded to PNG --- apps/sim/blocks/blocks/context_dev.ts | 30 ++++++++++++++++++++--- apps/sim/tools/context_dev/screenshot.ts | 31 ++++++++++++++++++++++-- 2 files changed, 55 insertions(+), 6 deletions(-) diff --git a/apps/sim/blocks/blocks/context_dev.ts b/apps/sim/blocks/blocks/context_dev.ts index 82a35587ed4..26298c41ef8 100644 --- a/apps/sim/blocks/blocks/context_dev.ts +++ b/apps/sim/blocks/blocks/context_dev.ts @@ -238,6 +238,13 @@ Do not include any explanations, markdown formatting, or other text outside the type: 'switch', condition: { field: 'operation', value: ['scrape_markdown', 'crawl'] }, }, + { + id: 'includeFrames', + title: 'Include Frames', + type: 'switch', + mode: 'advanced', + condition: { field: 'operation', value: ['scrape_markdown', 'scrape_html'] }, + }, { id: 'fullScreenshot', title: 'Full Page Screenshot', @@ -328,9 +335,17 @@ Do not include any explanations, markdown formatting, or other text outside the id: 'maxPages', title: 'Max Pages', type: 'short-input', - placeholder: 
'crawl: 100, extract: 5', + placeholder: '100', mode: 'advanced', - condition: { field: 'operation', value: ['crawl', 'extract'] }, + condition: { field: 'operation', value: 'crawl' }, + }, + { + id: 'extractMaxPages', + title: 'Max Pages', + type: 'short-input', + placeholder: '5', + mode: 'advanced', + condition: { field: 'operation', value: 'extract' }, }, { id: 'maxDepth', @@ -560,6 +575,7 @@ Do not include any explanations, markdown formatting, or other text outside the setBool('useMainContentOnly') setBool('includeLinks') setBool('includeImages') + setBool('includeFrames') setNumber('maxAgeMs') setNumber('waitForMs') setNumber('timeoutMS') @@ -567,6 +583,7 @@ Do not include any explanations, markdown formatting, or other text outside the case 'scrape_html': setString('url') setBool('useMainContentOnly') + setBool('includeFrames') setNumber('maxAgeMs') setNumber('waitForMs') setNumber('timeoutMS') @@ -638,7 +655,7 @@ Do not include any explanations, markdown formatting, or other text outside the setString('instructions') setBool('factCheck') setBool('followSubdomains') - setNumber('maxPages') + setNumber('extractMaxPages', 'maxPages') setNumber('maxDepth') setNumber('maxAgeMs') setNumber('stopAfterMs') @@ -755,6 +772,7 @@ Do not include any explanations, markdown formatting, or other text outside the useMainContentOnly: { type: 'boolean', description: 'Return only main content' }, includeLinks: { type: 'boolean', description: 'Preserve hyperlinks' }, includeImages: { type: 'boolean', description: 'Include image references' }, + includeFrames: { type: 'boolean', description: 'Render iframe contents inline' }, fullScreenshot: { type: 'boolean', description: 'Capture the full page' }, handleCookiePopup: { type: 'boolean', description: 'Dismiss cookie banners' }, markdownEnabled: { type: 'boolean', description: 'Scrape search results to markdown' }, @@ -766,7 +784,11 @@ Do not include any explanations, markdown formatting, or other text outside the queryFanout: { 
type: 'boolean', description: 'Expand query into variants' }, factCheck: { type: 'boolean', description: 'Ground extracted values in page facts' }, followSubdomains: { type: 'boolean', description: 'Follow subdomain links' }, - maxPages: { type: 'number', description: 'Maximum pages to process' }, + maxPages: { type: 'number', description: 'Maximum pages to crawl (1-500)' }, + extractMaxPages: { + type: 'number', + description: 'Maximum pages to analyze for extraction (1-50)', + }, maxDepth: { type: 'number', description: 'Maximum link depth' }, maxProducts: { type: 'number', description: 'Maximum products to extract' }, urlRegex: { type: 'string', description: 'Regex to filter URLs' }, diff --git a/apps/sim/tools/context_dev/screenshot.ts b/apps/sim/tools/context_dev/screenshot.ts index 58e92a1b565..130402a8023 100644 --- a/apps/sim/tools/context_dev/screenshot.ts +++ b/apps/sim/tools/context_dev/screenshot.ts @@ -12,6 +12,32 @@ import { } from '@/tools/context_dev/utils' import type { ToolConfig, ToolFileData } from '@/tools/types' +/** Maps a lowercase image file extension to its MIME type. */ +const IMAGE_MIME_BY_EXTENSION: Record = { + png: 'image/png', + jpg: 'image/jpeg', + jpeg: 'image/jpeg', + webp: 'image/webp', + gif: 'image/gif', + avif: 'image/avif', +} + +/** + * Derives the file extension and MIME type for a stored screenshot from its URL, + * falling back to PNG when the URL has no recognizable image extension. + */ +function screenshotFileMeta(url: string): { extension: string; mimeType: string } { + try { + const ext = new URL(url).pathname.split('.').pop()?.toLowerCase() ?? '' + if (IMAGE_MIME_BY_EXTENSION[ext]) { + return { extension: ext, mimeType: IMAGE_MIME_BY_EXTENSION[ext] } + } + } catch { + // Unparseable URL — fall back to the default below. 
+ } + return { extension: 'png', mimeType: 'image/png' } +} + export const contextDevScreenshotTool: ToolConfig< ContextDevScreenshotParams, ContextDevScreenshotResponse @@ -100,10 +126,11 @@ export const contextDevScreenshotTool: ToolConfig< const screenshotUrl: string = data.screenshot ?? '' const domain: string | null = data.domain ?? null + const { extension, mimeType } = screenshotFileMeta(screenshotUrl) const file: ToolFileData | undefined = screenshotUrl ? { - name: `${domain ?? 'screenshot'}.png`, - mimeType: 'image/png', + name: `${domain ?? 'screenshot'}.${extension}`, + mimeType, url: screenshotUrl, } : undefined