Scrape Markdown

JavaScript

import ContextDev from 'context.dev';

const client = new ContextDev({
  apiKey: process.env['CONTEXT_DEV_API_KEY'], // This is the default and can be omitted
});

const response = await client.web.webScrapeMd({ url: 'https://example.com' });

console.log(response.contentLength);

import os
from context.dev import ContextDev

client = ContextDev(
    api_key=os.environ.get("CONTEXT_DEV_API_KEY"),  # This is the default and can be omitted
)
response = client.web.web_scrape_md(
    url="https://example.com",
)
print(response.content_length)

package main

import (
	"context"
	"fmt"

	"github.com/context-dot-dev/context-go-sdk"
	"github.com/context-dot-dev/context-go-sdk/option"
)

func main() {
	client := contextdev.NewClient(
		option.WithAPIKey("My API Key"),
	)
	response, err := client.Web.WebScrapeMd(context.TODO(), contextdev.WebWebScrapeMdParams{
		URL: "https://example.com",
	})
	if err != nil {
		panic(err.Error())
	}
	fmt.Printf("%+v\n", response.ContentLength)
}

require "context_dev"

context_dev = ContextDev::Client.new(api_key: "My API Key")

response = context_dev.web.web_scrape_md(url: "https://example.com")

puts(response)

<?php

require_once dirname(__DIR__) . '/vendor/autoload.php';

use ContextDev\Client;
use ContextDev\Core\Exceptions\APIException;

$client = new Client(apiKey: getenv('CONTEXT_DEV_API_KEY') ?: 'My API Key');

try {
  $response = $client->web->webScrapeMd(
    url: 'https://example.com',
    actions: [['do' => 'wait', 'timeMs' => 0]],
    country: 'de',
    excludeSelectors: ['x'],
    headers: ['foo' => 'J!'],
    includeFrames: 'true',
    includeImages: 'true',
    includeLinks: 'true',
    includeSelectors: ['x'],
    maxAgeMs: 0,
    pdf: ['end' => 1, 'ocr' => 'true', 'shouldParse' => 'true', 'start' => 1],
    settleAnimations: 'true',
    shortenBase64Images: 'true',
    tags: ['production', 'team-alpha'],
    timeoutMs: 1,
    useMainContentOnly: 'true',
    waitForMs: 0,
    zdr: 'enabled',
  );

  var_dump($response);
} catch (APIException $e) {
  echo $e->getMessage();
}

context-dev web web-scrape-md \
  --api-key 'My API Key' \
  --url https://example.com

curl --request GET \
  --url https://api.context.dev/v1/web/scrape/markdown \
  --header 'Authorization: Bearer <token>'

HttpResponse<String> response = Unirest.get("https://api.context.dev/v1/web/scrape/markdown")
  .header("Authorization", "Bearer <token>")
  .asString();

{
  "success": true,
  "markdown": "<string>",
  "contentLength": 1,
  "url": "<string>",
  "metadata": {
    "sourceUrl": "<string>",
    "finalUrl": "<string>",
    "title": "<string>",
    "description": "<string>",
    "language": "<string>",
    "keywords": [
      "<string>"
    ],
    "canonicalUrl": "<string>",
    "author": "<string>",
    "siteName": "<string>",
    "image": "<string>",
    "favicon": "<string>",
    "publishedTime": "<string>",
    "modifiedTime": "<string>",
    "robots": "<string>",
    "openGraph": {},
    "twitter": {},
    "alternates": [
      {
        "href": "<string>",
        "hreflang": "<string>",
        "type": "<string>",
        "title": "<string>"
      }
    ],
    "jsonLd": [
      {}
    ],
    "additionalMeta": {}
  },
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "error_code": "UNAUTHORIZED",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "error_code": "NOT_FOUND",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "error_code": "REQUEST_TIMEOUT",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "error_code": "UNSUPPORTED_CONTENT",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "error_code": "RATE_LIMITED",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "error_code": "INTERNAL_ERROR",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

GET

web

scrape

markdown

JavaScript

import ContextDev from 'context.dev';

const client = new ContextDev({
  apiKey: process.env['CONTEXT_DEV_API_KEY'], // This is the default and can be omitted
});

const response = await client.web.webScrapeMd({ url: 'https://example.com' });

console.log(response.contentLength);

import os
from context.dev import ContextDev

client = ContextDev(
    api_key=os.environ.get("CONTEXT_DEV_API_KEY"),  # This is the default and can be omitted
)
response = client.web.web_scrape_md(
    url="https://example.com",
)
print(response.content_length)

package main

import (
	"context"
	"fmt"

	"github.com/context-dot-dev/context-go-sdk"
	"github.com/context-dot-dev/context-go-sdk/option"
)

func main() {
	client := contextdev.NewClient(
		option.WithAPIKey("My API Key"),
	)
	response, err := client.Web.WebScrapeMd(context.TODO(), contextdev.WebWebScrapeMdParams{
		URL: "https://example.com",
	})
	if err != nil {
		panic(err.Error())
	}
	fmt.Printf("%+v\n", response.ContentLength)
}

require "context_dev"

context_dev = ContextDev::Client.new(api_key: "My API Key")

response = context_dev.web.web_scrape_md(url: "https://example.com")

puts(response)

<?php

require_once dirname(__DIR__) . '/vendor/autoload.php';

use ContextDev\Client;
use ContextDev\Core\Exceptions\APIException;

$client = new Client(apiKey: getenv('CONTEXT_DEV_API_KEY') ?: 'My API Key');

try {
  $response = $client->web->webScrapeMd(
    url: 'https://example.com',
    actions: [['do' => 'wait', 'timeMs' => 0]],
    country: 'de',
    excludeSelectors: ['x'],
    headers: ['foo' => 'J!'],
    includeFrames: 'true',
    includeImages: 'true',
    includeLinks: 'true',
    includeSelectors: ['x'],
    maxAgeMs: 0,
    pdf: ['end' => 1, 'ocr' => 'true', 'shouldParse' => 'true', 'start' => 1],
    settleAnimations: 'true',
    shortenBase64Images: 'true',
    tags: ['production', 'team-alpha'],
    timeoutMs: 1,
    useMainContentOnly: 'true',
    waitForMs: 0,
    zdr: 'enabled',
  );

  var_dump($response);
} catch (APIException $e) {
  echo $e->getMessage();
}

context-dev web web-scrape-md \
  --api-key 'My API Key' \
  --url https://example.com

curl --request GET \
  --url https://api.context.dev/v1/web/scrape/markdown \
  --header 'Authorization: Bearer <token>'

HttpResponse<String> response = Unirest.get("https://api.context.dev/v1/web/scrape/markdown")
  .header("Authorization", "Bearer <token>")
  .asString();

{
  "success": true,
  "markdown": "<string>",
  "contentLength": 1,
  "url": "<string>",
  "metadata": {
    "sourceUrl": "<string>",
    "finalUrl": "<string>",
    "title": "<string>",
    "description": "<string>",
    "language": "<string>",
    "keywords": [
      "<string>"
    ],
    "canonicalUrl": "<string>",
    "author": "<string>",
    "siteName": "<string>",
    "image": "<string>",
    "favicon": "<string>",
    "publishedTime": "<string>",
    "modifiedTime": "<string>",
    "robots": "<string>",
    "openGraph": {},
    "twitter": {},
    "alternates": [
      {
        "href": "<string>",
        "hreflang": "<string>",
        "type": "<string>",
        "title": "<string>"
      }
    ],
    "jsonLd": [
      {}
    ],
    "additionalMeta": {}
  },
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "error_code": "UNAUTHORIZED",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "error_code": "NOT_FOUND",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "error_code": "REQUEST_TIMEOUT",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "error_code": "UNSUPPORTED_CONTENT",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "error_code": "RATE_LIMITED",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "error_code": "INTERNAL_ERROR",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

Cost: 1 credit per request; 2 credits when actions are used.

Authorizations

Authorization

string

header

required

Bearer authentication header of the form Bearer <token>, where <token> is your auth token.

Query Parameters

url

string<uri>

required

Full URL to scrape into LLM-usable Markdown (must include the http:// or https:// protocol)

Minimum string length: 1

includeLinks

default:true

Preserve hyperlinks in Markdown output

includeImages

default:false

Include image references in Markdown output

shortenBase64Images

default:true

Shorten base64-encoded image data in the Markdown output

useMainContentOnly

default:false

Extract only the main content of the page, excluding headers, footers, sidebars, and navigation

pdf

object

PDF parsing controls. Use start/end to limit text extraction and embedded-image detection/OCR to an inclusive 1-based page range.

Show child attributes

includeFrames

default:false

When true, the contents of iframes are rendered to Markdown.

includeSelectors

string[] | null

CSS selectors. When provided, only matching HTML subtrees (and their descendants) are kept before conversion to Markdown. When omitted, the entire document is kept. Examples: "article.main", "#content", "[role=main]".

Maximum array length: 50

Required string length: 1 - 2048

excludeSelectors

string[] | null

CSS selectors to remove before conversion to Markdown. Applied after includeSelectors. Exclusion takes precedence: an element matching both is removed. Examples: "nav", "footer", ".ad-banner", "[aria-hidden=true]".

Maximum array length: 50

Required string length: 1 - 2048

maxAgeMs

integer | null

default:86400000

Return a cached result if a prior scrape for the same parameters exists and is younger than this many milliseconds. Defaults to 1 day (86400000 ms) when omitted. Max is 30 days (2592000000 ms). Set to 0 to always scrape fresh.

Required range: 0 <= x <= 2592000000

waitForMs

integer | null

Optional browser wait time in milliseconds after initial page load before converting the page to Markdown. Min: 0. Max: 30000 (30 seconds).

Required range: 0 <= x <= 30000

settleAnimations

default:false

When true, waits briefly for CSS animations and transitions to settle before converting to Markdown. Defaults to false. This adds a small amount of latency in exchange for more stable output on animated pages.

actions

(Wait · object | Perform · object)[] | null

Optional browser actions executed in array order after the page loads and before content is captured. Requires a paid plan. Send a JSON array in the query parameter. Maximum: 5 actions.

Maximum array length: 5

Browser action, discriminated by its "do" field. Each variant exposes only its applicable fields.

Wait
Perform

Show child attributes

headers

object

Optional outbound HTTP headers forwarded only to the target URL, sent as deep-object query params such as headers[X-Custom]=value. When provided, caching is bypassed: the result is neither read from nor written to cache.

Show child attributes

country

enum<string>

Two-letter ISO 3166-1 alpha-2 country code identifying a supported Context.dev residential proxy exit location. Must be one of Context.dev's supported countries. When provided, Context.dev fetches the target page from that country.

Available options:

ad,

ae,

af,

ag,

ai,

al,

am,

ao,

ar,

at,

au,

aw,

az,

ba,

bb,

bd,

be,

bf,

bg,

bh,

bi,

bj,

bm,

bn,

bo,

bq,

br,

bs,

bw,

by,

bz,

ca,

cd,

cf,

cg,

ch,

ci,

cl,

cm,

cn,

co,

cr,

cv,

cw,

cy,

cz,

de,

dj,

dk,

dm,

do,

dz,

ec,

ee,

eg,

es,

et,

fi,

fj,

fr,

ga,

gb,

gd,

ge,

gf,

gg,

gh,

gm,

gn,

gp,

gq,

gr,

gt,

gu,

gw,

gy,

hk,

hn,

hr,

ht,

hu,

id,

ie,

il,

im,

in,

iq,

ir,

is,

it,

je,

jm,

jo,

jp,

ke,

kg,

kh,

kn,

kr,

kw,

ky,

kz,

la,

lb,

lc,

lk,

lr,

ls,

lt,

lu,

lv,

ly,

ma,

mc,

md,

me,

mf,

mg,

mk,

ml,

mm,

mn,

mo,

mq,

mr,

mt,

mu,

mv,

mw,

mx,

my,

mz,

na,

nc,

ne,

ng,

ni,

nl,

no,

np,

nz,

om,

pa,

pe,

pf,

pg,

ph,

pk,

pl,

pr,

ps,

pt,

py,

qa,

re,

ro,

rs,

ru,

rw,

sa,

sc,

sd,

se,

sg,

si,

sk,

sl,

sm,

sn,

so,

sr,

ss,

st,

sv,

sx,

sy,

sz,

tc,

td,

tg,

th,

tj,

tl,

tm,

tn,

tr,

tt,

tw,

tz,

ua,

ug,

us,

uy,

uz,

vc,

ve,

vg,

vi,

vn,

ye,

yt,

za,

zm,

zw

Example:

"de"

timeoutMS

integer

Optional timeout in milliseconds for the request. If the request takes longer than this value, it will be aborted with a 408 status code. Maximum allowed value is 300000ms (5 minutes).

Required range: 1 <= x <= 300000

zdr

enum<string>

default:disabled

Set to enabled to bypass shared caches and omit request and response content from retained usage logs. Requires zero data retention to be enabled for your organization (contact Context.dev support to request it); otherwise, the request fails with ZDR_NOT_ENABLED. Successful ZDR responses include the header X-Context-ZDR: true.

Available options:

enabled,

disabled

Response

Successful response

success

enum<boolean>

required

Indicates success

Available options:

true

markdown

string

required

Page content converted to GitHub Flavored Markdown

contentLength

integer

required

UTF-8 byte length of the returned Markdown. Use 0 to identify an empty result and compare small values against your workload's minimum useful-content threshold.

Required range: x >= 0

url

string

required

The URL that was scraped

metadata

object

required

Metadata extracted from the scraped page HTML.

Show child attributes

key_metadata

object

Metadata about the API key used for the request. Included in every response whenever a valid API key is provided, even when the response status is not 200.

Show child attributes

Scrape HTML

⌘I

Web Extraction

Brand Intelligence

Entity Enrichment

Monitor Infrastructure

Utility

Authorizations

Query Parameters

Response