Crawl Website Content

JavaScript

import ContextDev from 'context.dev';

const client = new ContextDev({
  apiKey: process.env['CONTEXT_DEV_API_KEY'], // This is the default and can be omitted
});

const response = await client.web.webCrawlMd({ url: 'https://example.com' });

console.log(response.metadata);

import os
from context.dev import ContextDev

client = ContextDev(
    api_key=os.environ.get("CONTEXT_DEV_API_KEY"),  # This is the default and can be omitted
)
response = client.web.web_crawl_md(
    url="https://example.com",
)
print(response.metadata)

package main

import (
	"context"
	"fmt"

	"github.com/context-dot-dev/context-go-sdk"
	"github.com/context-dot-dev/context-go-sdk/option"
)

func main() {
	client := contextdev.NewClient(
		option.WithAPIKey("My API Key"),
	)
	response, err := client.Web.WebCrawlMd(context.TODO(), contextdev.WebWebCrawlMdParams{
		URL: "https://example.com",
	})
	if err != nil {
		panic(err.Error())
	}
	fmt.Printf("%+v\n", response.Metadata)
}

require "context_dev"

context_dev = ContextDev::Client.new(api_key: "My API Key")

response = context_dev.web.web_crawl_md(url: "https://example.com")

puts(response)

<?php

require_once dirname(__DIR__) . '/vendor/autoload.php';

use ContextDev\Client;
use ContextDev\Core\Exceptions\APIException;

$client = new Client(apiKey: getenv('CONTEXT_DEV_API_KEY') ?: 'My API Key');

try {
  $response = $client->web->webCrawlMd(
    url: 'https://example.com',
    country: 'de',
    excludeSelectors: ['string'],
    followSubdomains: true,
    includeFrames: true,
    includeImages: true,
    includeLinks: true,
    includeSelectors: ['string'],
    maxAgeMs: 0,
    maxDepth: 0,
    maxPages: 1,
    pdf: ['end' => 1, 'ocr' => true, 'shouldParse' => true, 'start' => 1],
    settleAnimations: true,
    shortenBase64Images: true,
    stopAfterMs: 10000,
    tags: ['production', 'team-alpha'],
    timeoutMs: 1000,
    urlRegex: '^https?://[^/]+/blog/',
    useMainContentOnly: true,
    waitForMs: 0,
    zdr: 'enabled',
  );

  var_dump($response);
} catch (APIException $e) {
  echo $e->getMessage();
}

context-dev web web-crawl-md \
  --api-key 'My API Key' \
  --url https://example.com

curl --request POST \
  --url https://api.context.dev/v1/web/crawl \
  --header 'Authorization: Bearer <token>' \
  --header 'Content-Type: application/json' \
  --data '
{
  "url": "<string>",
  "maxPages": 100,
  "maxDepth": 1,
  "urlRegex": "^https?://[^/]+/blog/",
  "includeLinks": true,
  "includeImages": false,
  "shortenBase64Images": true,
  "useMainContentOnly": false,
  "followSubdomains": false,
  "pdf": {
    "shouldParse": true,
    "ocr": false
  },
  "includeFrames": false,
  "includeSelectors": [
    "<string>"
  ],
  "excludeSelectors": [
    "<string>"
  ],
  "maxAgeMs": 86400000,
  "waitForMs": 15000,
  "settleAnimations": false,
  "stopAfterMs": 80000,
  "country": "de",
  "timeoutMS": 150500,
  "zdr": "disabled",
  "tags": [
    "production",
    "team-alpha"
  ]
}
'

HttpResponse<String> response = Unirest.post("https://api.context.dev/v1/web/crawl")
  .header("Authorization", "Bearer <token>")
  .header("Content-Type", "application/json")
  .body("{\n  \"url\": \"<string>\",\n  \"maxPages\": 100,\n  \"maxDepth\": 1,\n  \"urlRegex\": \"^https?://[^/]+/blog/\",\n  \"includeLinks\": true,\n  \"includeImages\": false,\n  \"shortenBase64Images\": true,\n  \"useMainContentOnly\": false,\n  \"followSubdomains\": false,\n  \"pdf\": {\n    \"shouldParse\": true,\n    \"ocr\": false\n  },\n  \"includeFrames\": false,\n  \"includeSelectors\": [\n    \"<string>\"\n  ],\n  \"excludeSelectors\": [\n    \"<string>\"\n  ],\n  \"maxAgeMs\": 86400000,\n  \"waitForMs\": 15000,\n  \"settleAnimations\": false,\n  \"stopAfterMs\": 80000,\n  \"country\": \"de\",\n  \"timeoutMS\": 150500,\n  \"zdr\": \"disabled\",\n  \"tags\": [\n    \"production\",\n    \"team-alpha\"\n  ]\n}")
  .asString();

{
  "results": [
    {
      "markdown": "<string>",
      "metadata": {
        "sourceUrl": "<string>",
        "finalUrl": "<string>",
        "title": "<string>",
        "url": "<string>",
        "crawlDepth": 123,
        "statusCode": 123,
        "success": true,
        "description": "<string>",
        "language": "<string>",
        "keywords": [
          "<string>"
        ],
        "canonicalUrl": "<string>",
        "author": "<string>",
        "siteName": "<string>",
        "image": "<string>",
        "favicon": "<string>",
        "publishedTime": "<string>",
        "modifiedTime": "<string>",
        "robots": "<string>",
        "openGraph": {},
        "twitter": {},
        "alternates": [
          {
            "href": "<string>",
            "hreflang": "<string>",
            "type": "<string>",
            "title": "<string>"
          }
        ],
        "jsonLd": [
          {}
        ],
        "additionalMeta": {}
      }
    }
  ],
  "metadata": {
    "numUrls": 123,
    "maxCrawlDepth": 123,
    "numSucceeded": 123,
    "numFailed": 123,
    "numSkipped": 123
  },
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "error_code": "UNAUTHORIZED",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "error_code": "NOT_FOUND",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "error_code": "REQUEST_TIMEOUT",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "error_code": "UNSUPPORTED_CONTENT",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "error_code": "RATE_LIMITED",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "error_code": "INTERNAL_ERROR",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

POST

web

crawl

JavaScript

import ContextDev from 'context.dev';

const client = new ContextDev({
  apiKey: process.env['CONTEXT_DEV_API_KEY'], // This is the default and can be omitted
});

const response = await client.web.webCrawlMd({ url: 'https://example.com' });

console.log(response.metadata);

import os
from context.dev import ContextDev

client = ContextDev(
    api_key=os.environ.get("CONTEXT_DEV_API_KEY"),  # This is the default and can be omitted
)
response = client.web.web_crawl_md(
    url="https://example.com",
)
print(response.metadata)

package main

import (
	"context"
	"fmt"

	"github.com/context-dot-dev/context-go-sdk"
	"github.com/context-dot-dev/context-go-sdk/option"
)

func main() {
	client := contextdev.NewClient(
		option.WithAPIKey("My API Key"),
	)
	response, err := client.Web.WebCrawlMd(context.TODO(), contextdev.WebWebCrawlMdParams{
		URL: "https://example.com",
	})
	if err != nil {
		panic(err.Error())
	}
	fmt.Printf("%+v\n", response.Metadata)
}

require "context_dev"

context_dev = ContextDev::Client.new(api_key: "My API Key")

response = context_dev.web.web_crawl_md(url: "https://example.com")

puts(response)

<?php

require_once dirname(__DIR__) . '/vendor/autoload.php';

use ContextDev\Client;
use ContextDev\Core\Exceptions\APIException;

$client = new Client(apiKey: getenv('CONTEXT_DEV_API_KEY') ?: 'My API Key');

try {
  $response = $client->web->webCrawlMd(
    url: 'https://example.com',
    country: 'de',
    excludeSelectors: ['string'],
    followSubdomains: true,
    includeFrames: true,
    includeImages: true,
    includeLinks: true,
    includeSelectors: ['string'],
    maxAgeMs: 0,
    maxDepth: 0,
    maxPages: 1,
    pdf: ['end' => 1, 'ocr' => true, 'shouldParse' => true, 'start' => 1],
    settleAnimations: true,
    shortenBase64Images: true,
    stopAfterMs: 10000,
    tags: ['production', 'team-alpha'],
    timeoutMs: 1000,
    urlRegex: '^https?://[^/]+/blog/',
    useMainContentOnly: true,
    waitForMs: 0,
    zdr: 'enabled',
  );

  var_dump($response);
} catch (APIException $e) {
  echo $e->getMessage();
}

context-dev web web-crawl-md \
  --api-key 'My API Key' \
  --url https://example.com

curl --request POST \
  --url https://api.context.dev/v1/web/crawl \
  --header 'Authorization: Bearer <token>' \
  --header 'Content-Type: application/json' \
  --data '
{
  "url": "<string>",
  "maxPages": 100,
  "maxDepth": 1,
  "urlRegex": "^https?://[^/]+/blog/",
  "includeLinks": true,
  "includeImages": false,
  "shortenBase64Images": true,
  "useMainContentOnly": false,
  "followSubdomains": false,
  "pdf": {
    "shouldParse": true,
    "ocr": false
  },
  "includeFrames": false,
  "includeSelectors": [
    "<string>"
  ],
  "excludeSelectors": [
    "<string>"
  ],
  "maxAgeMs": 86400000,
  "waitForMs": 15000,
  "settleAnimations": false,
  "stopAfterMs": 80000,
  "country": "de",
  "timeoutMS": 150500,
  "zdr": "disabled",
  "tags": [
    "production",
    "team-alpha"
  ]
}
'

HttpResponse<String> response = Unirest.post("https://api.context.dev/v1/web/crawl")
  .header("Authorization", "Bearer <token>")
  .header("Content-Type", "application/json")
  .body("{\n  \"url\": \"<string>\",\n  \"maxPages\": 100,\n  \"maxDepth\": 1,\n  \"urlRegex\": \"^https?://[^/]+/blog/\",\n  \"includeLinks\": true,\n  \"includeImages\": false,\n  \"shortenBase64Images\": true,\n  \"useMainContentOnly\": false,\n  \"followSubdomains\": false,\n  \"pdf\": {\n    \"shouldParse\": true,\n    \"ocr\": false\n  },\n  \"includeFrames\": false,\n  \"includeSelectors\": [\n    \"<string>\"\n  ],\n  \"excludeSelectors\": [\n    \"<string>\"\n  ],\n  \"maxAgeMs\": 86400000,\n  \"waitForMs\": 15000,\n  \"settleAnimations\": false,\n  \"stopAfterMs\": 80000,\n  \"country\": \"de\",\n  \"timeoutMS\": 150500,\n  \"zdr\": \"disabled\",\n  \"tags\": [\n    \"production\",\n    \"team-alpha\"\n  ]\n}")
  .asString();

{
  "results": [
    {
      "markdown": "<string>",
      "metadata": {
        "sourceUrl": "<string>",
        "finalUrl": "<string>",
        "title": "<string>",
        "url": "<string>",
        "crawlDepth": 123,
        "statusCode": 123,
        "success": true,
        "description": "<string>",
        "language": "<string>",
        "keywords": [
          "<string>"
        ],
        "canonicalUrl": "<string>",
        "author": "<string>",
        "siteName": "<string>",
        "image": "<string>",
        "favicon": "<string>",
        "publishedTime": "<string>",
        "modifiedTime": "<string>",
        "robots": "<string>",
        "openGraph": {},
        "twitter": {},
        "alternates": [
          {
            "href": "<string>",
            "hreflang": "<string>",
            "type": "<string>",
            "title": "<string>"
          }
        ],
        "jsonLd": [
          {}
        ],
        "additionalMeta": {}
      }
    }
  ],
  "metadata": {
    "numUrls": 123,
    "maxCrawlDepth": 123,
    "numSucceeded": 123,
    "numFailed": 123,
    "numSkipped": 123
  },
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "error_code": "UNAUTHORIZED",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "error_code": "NOT_FOUND",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "error_code": "REQUEST_TIMEOUT",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "error_code": "UNSUPPORTED_CONTENT",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "error_code": "RATE_LIMITED",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "error_code": "INTERNAL_ERROR",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

1 credit per page. Rate limit weight: 10.

Each call counts as 10 requests against your per-minute rate limit.

Authorizations

Authorization

string

header

required

Bearer authentication header of the form Bearer <token>, where <token> is your auth token.

Body

application/json

url

string<uri>

required

The starting URL for the crawl (must include http:// or https:// protocol)

maxPages

integer

default:100

Maximum number of pages to crawl. Hard cap: 500.

Required range: 1 <= x <= 500

maxDepth

integer

Maximum link depth from the starting URL (0 = only the starting page)

Required range: x >= 0

urlRegex

string

Regex pattern. Only URLs matching this pattern will be followed and scraped.

Example:

"^https?://[^/]+/blog/"

includeLinks

boolean

default:true

Preserve hyperlinks in the Markdown output

includeImages

boolean

default:false

Include image references in the Markdown output

shortenBase64Images

boolean

default:true

Truncate base64-encoded image data in the Markdown output

useMainContentOnly

boolean

default:false

Extract only the main content, stripping headers, footers, sidebars, and navigation

followSubdomains

boolean

default:false

When true, follow links on subdomains of the starting URL's domain (e.g. docs.example.com when starting from example.com). www and apex are always treated as equivalent.

pdf

object

PDF parsing controls. Use start/end to limit text extraction and embedded-image detection/OCR to an inclusive 1-based page range.

Show child attributes

includeFrames

boolean

default:false

When true, the contents of iframes are rendered to Markdown for each crawled page.

includeSelectors

string[]

CSS selectors. When provided, only matching HTML subtrees (and their descendants) are kept before each crawled page is converted to Markdown. When omitted, the entire document is kept. Examples: "article.main", "#content", "[role=main]".

Maximum array length: 50

Maximum string length: 2048

excludeSelectors

string[]

CSS selectors to remove before each crawled page is converted to Markdown. Applied after includeSelectors. Exclusion takes precedence: an element matching both is removed. Examples: "nav", "footer", ".ad-banner", "[aria-hidden=true]".

Maximum array length: 50

Maximum string length: 2048

maxAgeMs

integer

default:86400000

Return a cached result if a prior scrape with the same parameters exists and is younger than this many milliseconds. Defaults to 1 day (86400000 ms) when omitted. The maximum is 30 days (2592000000 ms). Set to 0 to always scrape fresh.

Required range: 0 <= x <= 2592000000

waitForMs

integer

Optional browser wait time in milliseconds after initial page load for each crawled page. Min: 0. Max: 30000 (30 seconds).

Required range: 0 <= x <= 30000

settleAnimations

boolean

default:false

When true, waits briefly for CSS animations and transitions to settle before extracting each crawled page. Defaults to false. This adds a small amount of latency in exchange for more stable output on animated pages.

stopAfterMs

integer

default:80000

Soft time budget for the crawl in milliseconds. After each scrape, the crawler checks the elapsed time and, if exceeded, returns the pages collected so far instead of continuing. Min: 10000 (10s). Max: 110000 (110s). Default: 80000 (80s).

Required range: 10000 <= x <= 110000

country

enum<string>

Two-letter ISO 3166-1 alpha-2 country code identifying a supported Context.dev residential proxy exit location. Must be one of Context.dev's supported countries. When provided, Context.dev fetches the target page from that country.

Available options:

ad,

ae,

af,

ag,

ai,

al,

am,

ao,

ar,

at,

au,

aw,

az,

ba,

bb,

bd,

be,

bf,

bg,

bh,

bi,

bj,

bm,

bn,

bo,

bq,

br,

bs,

bw,

by,

bz,

ca,

cd,

cf,

cg,

ch,

ci,

cl,

cm,

cn,

co,

cr,

cv,

cw,

cy,

cz,

de,

dj,

dk,

dm,

do,

dz,

ec,

ee,

eg,

es,

et,

fi,

fj,

fr,

ga,

gb,

gd,

ge,

gf,

gg,

gh,

gm,

gn,

gp,

gq,

gr,

gt,

gu,

gw,

gy,

hk,

hn,

hr,

ht,

hu,

id,

ie,

il,

im,

in,

iq,

ir,

is,

it,

je,

jm,

jo,

jp,

ke,

kg,

kh,

kn,

kr,

kw,

ky,

kz,

la,

lb,

lc,

lk,

lr,

ls,

lt,

lu,

lv,

ly,

ma,

mc,

md,

me,

mf,

mg,

mk,

ml,

mm,

mn,

mo,

mq,

mr,

mt,

mu,

mv,

mw,

mx,

my,

mz,

na,

nc,

ne,

ng,

ni,

nl,

no,

np,

nz,

om,

pa,

pe,

pf,

pg,

ph,

pk,

pl,

pr,

ps,

pt,

py,

qa,

re,

ro,

rs,

ru,

rw,

sa,

sc,

sd,

se,

sg,

si,

sk,

sl,

sm,

sn,

so,

sr,

ss,

st,

sv,

sx,

sy,

sz,

tc,

td,

tg,

th,

tj,

tl,

tm,

tn,

tr,

tt,

tw,

tz,

ua,

ug,

us,

uy,

uz,

vc,

ve,

vg,

vi,

vn,

ye,

yt,

za,

zm,

zw

Example:

"de"

timeoutMS

integer

Optional timeout in milliseconds for the request. If the request takes longer than this value, it will be aborted with a 408 status code. The minimum allowed value is 1000 ms (1 second) and the maximum is 300000 ms (5 minutes).

Required range: 1000 <= x <= 300000

zdr

enum<string>

default:disabled

Set to enabled to bypass shared caches and omit request and response content from retained usage logs. Requires zero data retention to be enabled for your organization (contact Context.dev support to have it enabled); otherwise the request fails with ZDR_NOT_ENABLED. Successful ZDR responses include X-Context-ZDR: true.

Available options:

enabled,

disabled

Response

Successful response

results

object[]

required

Show child attributes

metadata

object

required

Show child attributes

key_metadata

object

Metadata about the API key used for the request. Included in every response whenever a valid API key is provided, even when the response status is not 200.

Show child attributes

Crawl Sitemap

Search the Web

⌘I

Web Extraction

Brand Intelligence

Entity Enrichment

Monitor Infrastructure

Utility

Crawl Website Content

Authorizations

Body

Response