Extract Structured Data

JavaScript

import ContextDev from 'context.dev';

const client = new ContextDev({
  apiKey: process.env['CONTEXT_DEV_API_KEY'], // This is the default and can be omitted
});

const response = await client.web.extract({
  schema: {
    type: 'bar',
    properties: 'bar',
    required: 'bar',
    additionalProperties: 'bar',
  },
  url: 'https://example.com',
});

console.log(response.data);

import os
from context.dev import ContextDev

client = ContextDev(
    api_key=os.environ.get("CONTEXT_DEV_API_KEY"),  # This is the default and can be omitted
)
response = client.web.extract(
    schema={
        "type": "bar",
        "properties": "bar",
        "required": "bar",
        "additionalProperties": "bar",
    },
    url="https://example.com",
)
print(response.data)

package main

import (
	"context"
	"fmt"

	"github.com/context-dot-dev/context-go-sdk"
	"github.com/context-dot-dev/context-go-sdk/option"
)

func main() {
	client := contextdev.NewClient(
		option.WithAPIKey("My API Key"),
	)
	response, err := client.Web.Extract(context.TODO(), contextdev.WebExtractParams{
		Schema: map[string]any{
			"type":                 "bar",
			"properties":           "bar",
			"required":             "bar",
			"additionalProperties": "bar",
		},
		URL: "https://example.com",
	})
	if err != nil {
		panic(err.Error())
	}
	fmt.Printf("%+v\n", response.Data)
}

require "context_dev"

context_dev = ContextDev::Client.new(api_key: "My API Key")

response = context_dev.web.extract(
  schema: {type: "bar", properties: "bar", required: "bar", additionalProperties: "bar"},
  url: "https://example.com"
)

puts(response)

<?php

require_once dirname(__DIR__) . '/vendor/autoload.php';

use ContextDev\Client;
use ContextDev\Core\Exceptions\APIException;

$client = new Client(apiKey: getenv('CONTEXT_DEV_API_KEY') ?: 'My API Key');

try {
  $response = $client->web->extract(
    schema: [
      'type' => 'bar',
      'properties' => 'bar',
      'required' => 'bar',
      'additionalProperties' => 'bar',
    ],
    url: 'https://example.com',
    factCheck: true,
    followSubdomains: true,
    includeFrames: true,
    instructions: 'instructions',
    maxAgeMs: 0,
    maxDepth: 0,
    maxPages: 1,
    pdf: ['end' => 1, 'shouldParse' => true, 'start' => 1],
    settleAnimations: true,
    stopAfterMs: 10000,
    tags: ['production', 'team-alpha'],
    timeoutMs: 1000,
    waitForMs: 0,
  );

  var_dump($response);
} catch (APIException $e) {
  echo $e->getMessage();
}

context-dev web extract \
  --api-key 'My API Key' \
  --schema '{type: bar, properties: bar, required: bar, additionalProperties: bar}' \
  --url https://example.com

curl --request POST \
  --url https://api.context.dev/v1/web/extract \
  --header 'Authorization: Bearer <token>' \
  --header 'Content-Type: application/json' \
  --data @- <<EOF
{
  "url": "<string>",
  "schema": {
    "type": "object",
    "properties": {
      "mission_statement": {
        "type": "string",
        "description": "The company's stated mission."
      },
      "case_studies": {
        "type": "array",
        "items": {
          "type": "object",
          "properties": {
            "title": {
              "type": "string"
            },
            "url": {
              "type": "string"
            }
          },
          "required": [
            "title",
            "url"
          ],
          "additionalProperties": false
        }
      }
    },
    "required": [
      "mission_statement",
      "case_studies"
    ],
    "additionalProperties": false
  },
  "instructions": "<string>",
  "factCheck": false,
  "followSubdomains": false,
  "maxPages": 5,
  "maxDepth": 1,
  "pdf": {
    "shouldParse": true
  },
  "includeFrames": false,
  "maxAgeMs": 604800000,
  "waitForMs": 15000,
  "settleAnimations": false,
  "stopAfterMs": 80000,
  "timeoutMS": 150500,
  "tags": [
    "production",
    "team-alpha"
  ]
}
EOF

HttpResponse<String> response = Unirest.post("https://api.context.dev/v1/web/extract")
  .header("Authorization", "Bearer <token>")
  .header("Content-Type", "application/json")
  .body("{\n  \"url\": \"<string>\",\n  \"schema\": {\n    \"type\": \"object\",\n    \"properties\": {\n      \"mission_statement\": {\n        \"type\": \"string\",\n        \"description\": \"The company's stated mission.\"\n      },\n      \"case_studies\": {\n        \"type\": \"array\",\n        \"items\": {\n          \"type\": \"object\",\n          \"properties\": {\n            \"title\": {\n              \"type\": \"string\"\n            },\n            \"url\": {\n              \"type\": \"string\"\n            }\n          },\n          \"required\": [\n            \"title\",\n            \"url\"\n          ],\n          \"additionalProperties\": false\n        }\n      }\n    },\n    \"required\": [\n      \"mission_statement\",\n      \"case_studies\"\n    ],\n    \"additionalProperties\": false\n  },\n  \"instructions\": \"<string>\",\n  \"factCheck\": false,\n  \"followSubdomains\": false,\n  \"maxPages\": 5,\n  \"maxDepth\": 1,\n  \"pdf\": {\n    \"shouldParse\": true\n  },\n  \"includeFrames\": false,\n  \"maxAgeMs\": 604800000,\n  \"waitForMs\": 15000,\n  \"settleAnimations\": false,\n  \"stopAfterMs\": 80000,\n  \"timeoutMS\": 150500,\n  \"tags\": [\n    \"production\",\n    \"team-alpha\"\n  ]\n}")
  .asString();

{
  "status": "<string>",
  "url": "<string>",
  "urls_analyzed": [
    "<string>"
  ],
  "data": {},
  "metadata": {
    "numUrls": 123,
    "maxCrawlDepth": 123,
    "numSucceeded": 123,
    "numFailed": 123,
    "numSkipped": 123,
    "numBlocked": 123
  },
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "error_code": "RATE_LIMITED",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

POST

web

extract

JavaScript

import ContextDev from 'context.dev';

const client = new ContextDev({
  apiKey: process.env['CONTEXT_DEV_API_KEY'], // This is the default and can be omitted
});

const response = await client.web.extract({
  schema: {
    type: 'bar',
    properties: 'bar',
    required: 'bar',
    additionalProperties: 'bar',
  },
  url: 'https://example.com',
});

console.log(response.data);

import os
from context.dev import ContextDev

client = ContextDev(
    api_key=os.environ.get("CONTEXT_DEV_API_KEY"),  # This is the default and can be omitted
)
response = client.web.extract(
    schema={
        "type": "bar",
        "properties": "bar",
        "required": "bar",
        "additionalProperties": "bar",
    },
    url="https://example.com",
)
print(response.data)

package main

import (
	"context"
	"fmt"

	"github.com/context-dot-dev/context-go-sdk"
	"github.com/context-dot-dev/context-go-sdk/option"
)

func main() {
	client := contextdev.NewClient(
		option.WithAPIKey("My API Key"),
	)
	response, err := client.Web.Extract(context.TODO(), contextdev.WebExtractParams{
		Schema: map[string]any{
			"type":                 "bar",
			"properties":           "bar",
			"required":             "bar",
			"additionalProperties": "bar",
		},
		URL: "https://example.com",
	})
	if err != nil {
		panic(err.Error())
	}
	fmt.Printf("%+v\n", response.Data)
}

require "context_dev"

context_dev = ContextDev::Client.new(api_key: "My API Key")

response = context_dev.web.extract(
  schema: {type: "bar", properties: "bar", required: "bar", additionalProperties: "bar"},
  url: "https://example.com"
)

puts(response)

<?php

require_once dirname(__DIR__) . '/vendor/autoload.php';

use ContextDev\Client;
use ContextDev\Core\Exceptions\APIException;

$client = new Client(apiKey: getenv('CONTEXT_DEV_API_KEY') ?: 'My API Key');

try {
  $response = $client->web->extract(
    schema: [
      'type' => 'bar',
      'properties' => 'bar',
      'required' => 'bar',
      'additionalProperties' => 'bar',
    ],
    url: 'https://example.com',
    factCheck: true,
    followSubdomains: true,
    includeFrames: true,
    instructions: 'instructions',
    maxAgeMs: 0,
    maxDepth: 0,
    maxPages: 1,
    pdf: ['end' => 1, 'shouldParse' => true, 'start' => 1],
    settleAnimations: true,
    stopAfterMs: 10000,
    tags: ['production', 'team-alpha'],
    timeoutMs: 1000,
    waitForMs: 0,
  );

  var_dump($response);
} catch (APIException $e) {
  echo $e->getMessage();
}

context-dev web extract \
  --api-key 'My API Key' \
  --schema '{type: bar, properties: bar, required: bar, additionalProperties: bar}' \
  --url https://example.com

curl --request POST \
  --url https://api.context.dev/v1/web/extract \
  --header 'Authorization: Bearer <token>' \
  --header 'Content-Type: application/json' \
  --data @- <<EOF
{
  "url": "<string>",
  "schema": {
    "type": "object",
    "properties": {
      "mission_statement": {
        "type": "string",
        "description": "The company's stated mission."
      },
      "case_studies": {
        "type": "array",
        "items": {
          "type": "object",
          "properties": {
            "title": {
              "type": "string"
            },
            "url": {
              "type": "string"
            }
          },
          "required": [
            "title",
            "url"
          ],
          "additionalProperties": false
        }
      }
    },
    "required": [
      "mission_statement",
      "case_studies"
    ],
    "additionalProperties": false
  },
  "instructions": "<string>",
  "factCheck": false,
  "followSubdomains": false,
  "maxPages": 5,
  "maxDepth": 1,
  "pdf": {
    "shouldParse": true
  },
  "includeFrames": false,
  "maxAgeMs": 604800000,
  "waitForMs": 15000,
  "settleAnimations": false,
  "stopAfterMs": 80000,
  "timeoutMS": 150500,
  "tags": [
    "production",
    "team-alpha"
  ]
}
EOF

HttpResponse<String> response = Unirest.post("https://api.context.dev/v1/web/extract")
  .header("Authorization", "Bearer <token>")
  .header("Content-Type", "application/json")
  .body("{\n  \"url\": \"<string>\",\n  \"schema\": {\n    \"type\": \"object\",\n    \"properties\": {\n      \"mission_statement\": {\n        \"type\": \"string\",\n        \"description\": \"The company's stated mission.\"\n      },\n      \"case_studies\": {\n        \"type\": \"array\",\n        \"items\": {\n          \"type\": \"object\",\n          \"properties\": {\n            \"title\": {\n              \"type\": \"string\"\n            },\n            \"url\": {\n              \"type\": \"string\"\n            }\n          },\n          \"required\": [\n            \"title\",\n            \"url\"\n          ],\n          \"additionalProperties\": false\n        }\n      }\n    },\n    \"required\": [\n      \"mission_statement\",\n      \"case_studies\"\n    ],\n    \"additionalProperties\": false\n  },\n  \"instructions\": \"<string>\",\n  \"factCheck\": false,\n  \"followSubdomains\": false,\n  \"maxPages\": 5,\n  \"maxDepth\": 1,\n  \"pdf\": {\n    \"shouldParse\": true\n  },\n  \"includeFrames\": false,\n  \"maxAgeMs\": 604800000,\n  \"waitForMs\": 15000,\n  \"settleAnimations\": false,\n  \"stopAfterMs\": 80000,\n  \"timeoutMS\": 150500,\n  \"tags\": [\n    \"production\",\n    \"team-alpha\"\n  ]\n}")
  .asString();

{
  "status": "<string>",
  "url": "<string>",
  "urls_analyzed": [
    "<string>"
  ],
  "data": {},
  "metadata": {
    "numUrls": 123,
    "maxCrawlDepth": 123,
    "numSucceeded": 123,
    "numFailed": 123,
    "numSkipped": 123,
    "numBlocked": 123
  },
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "error_code": "RATE_LIMITED",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

{
  "message": "<string>",
  "key_metadata": {
    "credits_consumed": 123,
    "credits_remaining": 123
  }
}

10 Credits

Authorizations

Authorization

string

header

required

Bearer authentication header of the form Bearer <token>, where <token> is your auth token.

Body

application/json

url

string<uri>

required

The starting website URL to crawl and extract from. Must include http:// or https://.

schema

object

required

JSON Schema for the returned data object. TypeScript Zod users can pass a JSON Schema generated from a Zod object; Python users can pass the equivalent JSON Schema object.

Example:

{
  "type": "object",
  "properties": {
    "mission_statement": {
      "type": "string",
      "description": "The company's stated mission."
    },
    "case_studies": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "title": { "type": "string" },
          "url": { "type": "string" }
        },
        "required": ["title", "url"],
        "additionalProperties": false
      }
    }
  },
  "required": ["mission_statement", "case_studies"],
  "additionalProperties": false
}

instructions

string

Optional extraction guidance, such as which facts to prioritize or how to interpret fields in the schema.

Maximum string length: 2000

factCheck

boolean

default:false

When true, every returned value must be grounded in facts stated on the page; fields that cannot be supported by the page are returned as null/empty. When false (default), the model may make reasonable inferences and derivations from the page content (e.g. ideal customer, competitor analysis, recommendations) while keeping verifiable specifics (names, quotes, URLs, dates, metrics) faithful to the source.

followSubdomains

boolean

default:false

When true, follow links on subdomains of the starting URL's domain.

maxPages

integer

default:5

Maximum number of pages to analyze for extraction. Hard cap: 50. Defaults to 5.

Required range: 1 <= x <= 50

maxDepth

integer

Optional maximum link depth from the starting URL (0 = only the starting page). If omitted, there is no crawl depth limit.

Required range: x >= 0

pdf

object

Show child attributes

includeFrames

boolean

default:false

When true, iframe contents are included in the page's Markdown before extraction.

maxAgeMs

integer

default:604800000

Return cached scrape results if a prior scrape with the same parameters is younger than this many milliseconds. Defaults to 7 days (604800000 ms).

Required range: 0 <= x <= 2592000000

waitForMs

integer

Optional browser wait time in milliseconds after initial page load for each crawled page.

Required range: 0 <= x <= 30000

settleAnimations

boolean

default:false

When true, waits briefly for CSS animations and transitions to settle before extracting each crawled page. Defaults to false. This adds a small amount of latency in exchange for more stable output on animated pages.

stopAfterMs

integer

default:80000

Soft time budget for the crawl in milliseconds. Min: 10000 (10s). Max: 110000 (110s). Default: 80000 (80s).

Required range: 10000 <= x <= 110000

timeoutMS

integer

Optional timeout in milliseconds for the request. If the request takes longer than this value, it will be aborted with a 408 status code. Maximum allowed value is 300000 ms (5 minutes).

Required range: 1000 <= x <= 300000

Response

Successful response

status

string

required

Status of the response, e.g., 'ok'

url

string

required

The starting URL that was analyzed

urls_analyzed

string[]

required

List of URLs whose Markdown was used for extraction

data

object

required

Extracted data matching the request schema

metadata

object

required

Show child attributes

key_metadata

object

Metadata about the API key used for the request. Included in every response whenever a valid API key is provided, even when the response status is not 200.

Show child attributes

Search the Web

Parse File Bytes to Markdown

⌘I

Web Extraction

Brand Intelligence

Entity Enrichment

Monitor Infrastructure

Utility

Extract Structured Data

Authorizations

Body

Response