> ## Documentation Index
> Fetch the complete documentation index at: https://docs.context.dev/llms.txt
> Use this file to discover all available pages before exploring further.

# Extract Structured Website Data

> Crawl a website, use the provided JSON Schema and instructions to prioritize relevant internal links, and extract structured data from the selected pages.

<Badge color="blue">10 Credits</Badge>


## OpenAPI

````yaml https://app.stainless.com/api/spec/documented/context.dev/openapi.documented.yml post /web/extract
# OpenAPI 3.0 description of the Context API. The only path defined in this
# document is POST /web/extract.
openapi: 3.0.0
info:
  title: Context API
  description: API for retrieving context data from any website
  version: 1.0.0
servers:
  # Operation paths are relative to this base URL.
  - url: https://api.context.dev/v1
# No document-wide security requirement is set here; the /web/extract
# operation declares its own `security` entry (bearerAuth).
security: []
paths:
  # POST /web/extract: crawl from a starting URL and return data shaped by a
  # caller-supplied JSON Schema.
  /web/extract:
    post:
      tags:
        - Web Extraction
      summary: Extract Structured Website Data
      description: >-
        Crawl a website, use the provided JSON Schema and instructions to
        prioritize relevant internal links, and extract structured data from the
        selected pages.
      # JSON request body. Only `url` and `schema` are required, and the body
      # schema sets additionalProperties: false, so no keys other than the
      # ones listed under `properties` are declared as allowed.
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: object
              properties:
                url:
                  type: string
                  format: uri
                  description: >-
                    The starting website URL to crawl and extract from. Must
                    include http:// or https://.
                # The property itself accepts any object
                # (additionalProperties: true). The example value is a JSON
                # Schema requiring mission_statement and case_studies.
                schema:
                  type: object
                  description: >-
                    JSON Schema for the returned data object. TypeScript Zod
                    users can pass a JSON Schema generated from a Zod object;
                    Python users can pass the equivalent JSON Schema object.
                  additionalProperties: true
                  example:
                    type: object
                    properties:
                      mission_statement:
                        type: string
                        description: The company's stated mission.
                      case_studies:
                        type: array
                        items:
                          type: object
                          properties:
                            title:
                              type: string
                            url:
                              type: string
                          required:
                            - title
                            - url
                          additionalProperties: false
                    required:
                      - mission_statement
                      - case_studies
                    additionalProperties: false
                # Not listed in `required`; capped at 2000 characters.
                instructions:
                  type: string
                  maxLength: 2000
                  description: >-
                    Optional extraction guidance, such as which facts to
                    prioritize or how to interpret fields in the schema.
                factCheck:
                  type: boolean
                  default: false
                  description: >-
                    When true, every returned value must be grounded in facts
                    stated on the page; fields that cannot be supported by the
                    page are returned as null/empty. When false (default), the
                    model may make reasonable inferences and derivations from
                    the page content (e.g. ideal customer, competitor analysis,
                    recommendations) while keeping verifiable specifics (names,
                    quotes, URLs, dates, metrics) faithful to the source.
                followSubdomains:
                  type: boolean
                  default: false
                  description: >-
                    When true, follow links on subdomains of the starting URL's
                    domain.
                maxPages:
                  type: integer
                  minimum: 1
                  maximum: 50
                  default: 5
                  description: >-
                    Maximum number of pages to analyze for extraction. Hard cap:
                    50. Defaults to 5.
                # No `default` and no `maximum` are declared; per the
                # description, omitting the field means no depth limit.
                maxDepth:
                  type: integer
                  minimum: 0
                  description: >-
                    Optional maximum link depth from the starting URL (0 = only
                    the starting page). If omitted, there is no crawl depth
                    limit.
                # PDF handling options. The "end >= start" rule is stated only
                # in the description of `end`; no schema keyword expresses it.
                pdf:
                  type: object
                  properties:
                    shouldParse:
                      type: boolean
                      default: true
                      description: >-
                        When true, PDF pages are fetched and parsed. When false,
                        PDF pages are skipped.
                    start:
                      type: integer
                      minimum: 1
                      description: First 1-based PDF page to parse.
                    end:
                      type: integer
                      minimum: 1
                      description: >-
                        Last 1-based PDF page to parse. Must be greater than or
                        equal to start when both are provided.
                  additionalProperties: false
                  # Object-level default; repeats shouldParse's own default.
                  default:
                    shouldParse: true
                includeFrames:
                  type: boolean
                  default: false
                  description: >-
                    When true, iframe contents are included in Markdown before
                    extraction.
                # Cache freshness window in milliseconds:
                # maximum 2592000000 ms = 30 days, default 604800000 ms = 7 days.
                maxAgeMs:
                  type: integer
                  minimum: 0
                  maximum: 2592000000
                  default: 604800000
                  description: >-
                    Return cached scrape results if a prior scrape for the same
                    parameters is younger than this many milliseconds. Defaults
                    to 7 days (604800000 ms).
                # Capped at 30000 ms (30 s); no default is declared.
                waitForMs:
                  type: integer
                  minimum: 0
                  maximum: 30000
                  description: >-
                    Optional browser wait time in milliseconds after initial
                    page load for each crawled page.
                # Range 10000-110000 ms. Its maximum is lower than the
                # 300000 ms ceiling of components.schemas.TimeoutMS.
                stopAfterMs:
                  type: integer
                  minimum: 10000
                  maximum: 110000
                  default: 80000
                  description: >-
                    Soft time budget for the crawl in milliseconds. Min: 10000
                    (10s). Max: 110000 (110s). Default: 80000 (80s).
                # NOTE(review): this key is spelled `timeoutMS`, while the
                # sibling keys use an `...Ms` suffix (maxAgeMs, waitForMs,
                # stopAfterMs). It is part of the public request contract, so
                # it is left unchanged here.
                timeoutMS:
                  $ref: '#/components/schemas/TimeoutMS'
              required:
                - url
                - schema
              additionalProperties: false
      # Responses for this operation:
      #   200           extraction result (inline schema)
      #   400           inline schema with a two-value error_code enum
      #   401/403/408/500  shared components.schemas.ErrorResponse
      #   429           components.responses.RateLimited
      responses:
        '200':
          description: Successful response
          content:
            application/json:
              schema:
                type: object
                properties:
                  status:
                    type: string
                    description: Status of the response, e.g., 'ok'
                  url:
                    type: string
                    description: The starting URL that was analyzed
                  urls_analyzed:
                    type: array
                    description: List of URLs whose Markdown was used for extraction
                    items:
                      type: string
                  # Free-form object; its shape is determined by the `schema`
                  # sent in the request, so it cannot be typed further here.
                  data:
                    type: object
                    description: Extracted data matching the request schema
                    additionalProperties: true
                  # Five required integer counters. The spec gives them no
                  # descriptions.
                  # NOTE(review): presumably crawl statistics (pages seen,
                  # succeeded, failed, skipped, depth reached) - confirm and
                  # add `description` fields.
                  metadata:
                    type: object
                    properties:
                      numUrls:
                        type: integer
                      maxCrawlDepth:
                        type: integer
                      numSucceeded:
                        type: integer
                      numFailed:
                        type: integer
                      numSkipped:
                        type: integer
                    required:
                      - numUrls
                      - maxCrawlDepth
                      - numSucceeded
                      - numFailed
                      - numSkipped
                  # Declared but not listed in `required` below.
                  key_metadata:
                    $ref: '#/components/schemas/KeyMetadata'
                required:
                  - status
                  - url
                  - urls_analyzed
                  - data
                  - metadata
        # Unlike the other error responses, 400 defines its schema inline and
        # narrows error_code to two values.
        '400':
          description: Bad request - Invalid URL, schema, or inaccessible website
          content:
            application/json:
              schema:
                type: object
                properties:
                  message:
                    type: string
                  error_code:
                    type: string
                    enum:
                      - INPUT_VALIDATION_ERROR
                      - WEBSITE_ACCESS_ERROR
                  key_metadata:
                    $ref: '#/components/schemas/KeyMetadata'
                required:
                  - message
                  - error_code
        '401':
          description: Unauthorized - Invalid or missing API key
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
        '403':
          description: Forbidden - Insufficient permissions or usage limit exceeded
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
        '408':
          description: Request timeout
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
        '429':
          $ref: '#/components/responses/RateLimited'
        '500':
          description: Internal server error
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
      # Operation-level requirement: the bearerAuth scheme defined under
      # components.securitySchemes, with no scopes.
      security:
        - bearerAuth: []
      # Per-language SDK samples for this operation. Each sample passes a
      # minimal but valid JSON Schema as `schema` (an object with one required
      # string property, `mission_statement`, as in the requestBody example)
      # rather than placeholder values, so the samples illustrate a request
      # whose `schema` is a well-formed JSON Schema.
      # NOTE(review): the Python sample imports `context.dev` while the Ruby
      # sample requires `context_dev` - confirm the Python module path.
      x-codeSamples:
        - lang: JavaScript
          source: |-
            import ContextDev from 'context.dev';

            const client = new ContextDev({
              apiKey: process.env['CONTEXT_DEV_API_KEY'], // This is the default and can be omitted
            });

            const response = await client.web.extract({
              schema: {
                type: 'object',
                properties: {
                  mission_statement: { type: 'string' },
                },
                required: ['mission_statement'],
                additionalProperties: false,
              },
              url: 'https://example.com',
            });

            console.log(response.data);
        - lang: Python
          source: |-
            import os
            from context.dev import ContextDev

            client = ContextDev(
                api_key=os.environ.get("CONTEXT_DEV_API_KEY"),  # This is the default and can be omitted
            )
            response = client.web.extract(
                schema={
                    "type": "object",
                    "properties": {
                        "mission_statement": {"type": "string"},
                    },
                    "required": ["mission_statement"],
                    "additionalProperties": False,
                },
                url="https://example.com",
            )
            print(response.data)
        - lang: Go
          source: "package main\n\nimport (\n\t\"context\"\n\t\"fmt\"\n\n\t\"github.com/context-dot-dev/context-go-sdk\"\n\t\"github.com/context-dot-dev/context-go-sdk/option\"\n)\n\nfunc main() {\n\tclient := contextdev.NewClient(\n\t\toption.WithAPIKey(\"My API Key\"),\n\t)\n\tresponse, err := client.Web.Extract(context.TODO(), contextdev.WebExtractParams{\n\t\tSchema: map[string]any{\n\t\t\t\"type\":       \"object\",\n\t\t\t\"properties\": map[string]any{\n\t\t\t\t\"mission_statement\": map[string]any{\"type\": \"string\"},\n\t\t\t},\n\t\t\t\"required\":             []string{\"mission_statement\"},\n\t\t\t\"additionalProperties\": false,\n\t\t},\n\t\tURL: \"https://example.com\",\n\t})\n\tif err != nil {\n\t\tpanic(err.Error())\n\t}\n\tfmt.Printf(\"%+v\\n\", response.Data)\n}\n"
        - lang: Ruby
          source: |-
            require "context_dev"

            context_dev = ContextDev::Client.new(api_key: "My API Key")

            response = context_dev.web.extract(
              schema: {
                type: "object",
                properties: {mission_statement: {type: "string"}},
                required: ["mission_statement"],
                additionalProperties: false
              },
              url: "https://example.com"
            )

            puts(response)
        - lang: CLI
          source: |-
            context-dev web extract \
              --api-key 'My API Key' \
              --schema '{type: object, properties: {mission_statement: {type: string}}, required: [mission_statement], additionalProperties: false}' \
              --url https://example.com
# Reusable definitions referenced via $ref from the /web/extract operation.
components:
  schemas:
    # Request timeout in milliseconds, referenced by the request body's
    # `timeoutMS` property. Allowed range is 1000-300000; no default is
    # declared.
    TimeoutMS:
      type: integer
      minimum: 1000
      maximum: 300000
      description: >-
        Optional timeout in milliseconds for the request. If the request takes
        longer than this value, it will be aborted with a 408 status code.
        Maximum allowed value is 300000ms (5 minutes).
    # Credit accounting for the API key. Referenced as the optional
    # `key_metadata` property of the success and error schemas in this
    # document; when present, both counters are required integers.
    KeyMetadata:
      type: object
      description: >-
        Metadata about the API key used for the request. Included in every
        response whenever a valid API key is provided, even when the response
        status is not 200.
      properties:
        credits_consumed:
          type: integer
          description: The number of credits consumed by this request.
        credits_remaining:
          type: integer
          description: >-
            The number of credits remaining for your organization after this
            request.
      required:
        - credits_consumed
        - credits_remaining
    # Shared error body used by the 401, 403, 408 and 500 responses of
    # /web/extract.
    # NOTE(review): this schema has no `required` list, whereas the inline
    # 400 schema and the RateLimited response both require `message` and
    # `error_code` - confirm whether the omission is intentional.
    ErrorResponse:
      type: object
      properties:
        message:
          type: string
          description: Error message
        # NOTE(review): `VALID` reads oddly among error codes - confirm it
        # is an intended member of this enum.
        error_code:
          type: string
          enum:
            - INTERNAL_ERROR
            - VALID
            - NOT_FOUND
            - FORBIDDEN
            - USAGE_EXCEEDED
            - RATE_LIMITED
            - UNAUTHORIZED
            - DISABLED
            - INSUFFICIENT_PERMISSIONS
            - TIMEOUT_EXCEEDS_MAXIMUM
            - WEBSITE_ACCESS_ERROR
            - EXTERNAL_PROVIDER_ERROR
            - INPUT_VALIDATION_ERROR
            - REQUEST_TIMEOUT
          description: Error code indicating the type of error
        key_metadata:
          $ref: '#/components/schemas/KeyMetadata'
  responses:
    # Reusable 429 response. Retry-After is constrained to 1-60 seconds,
    # consistent with the per-minute window named in its description, and
    # error_code is pinned to the single value RATE_LIMITED.
    RateLimited:
      description: Rate limit exceeded
      headers:
        Retry-After:
          description: Seconds until the per-minute rate limit window resets
          schema:
            type: integer
            minimum: 1
            maximum: 60
      content:
        application/json:
          schema:
            type: object
            properties:
              message:
                type: string
                description: Error message
              error_code:
                type: string
                enum:
                  - RATE_LIMITED
                description: Error code indicating the rate limit was exceeded
              key_metadata:
                $ref: '#/components/schemas/KeyMetadata'
            required:
              - message
              - error_code
  securitySchemes:
    # HTTP bearer authentication, required by the /web/extract operation.
    # NOTE(review): presumably the API key is sent as the bearer token (the
    # 401 description mentions an "API key") - confirm.
    bearerAuth:
      type: http
      scheme: bearer

````