> ## Documentation Index
> Fetch the complete documentation index at: https://docs.context.dev/llms.txt
> Use this file to discover all available pages before exploring further.

# Crawl Website & Scrape Markdown

> Performs a crawl starting from a given URL, extracts page content as Markdown, and returns results for all crawled pages.

<Badge color="blue">1 Credit Per Page</Badge>


## OpenAPI

````yaml https://app.stainless.com/api/spec/documented/context.dev/openapi.documented.yml post /web/crawl
# OpenAPI 3.0 document for the Context API. It describes one operation:
# POST /web/crawl.
openapi: 3.0.0
info:
  title: Context API
  description: API for retrieving context data from any website
  version: 1.0.0
# Base URL; the operation path below is appended to it.
servers:
  - url: https://api.context.dev/v1
# Empty list: no document-wide security requirement. The operation declares
# its own `security` entry.
security: []
paths:
  /web/crawl:
    post:
      tags:
        - Web Scraping
      summary: Crawl Website & Scrape Markdown
      description: >-
        Performs a crawl starting from a given URL, extracts page content as
        Markdown, and returns results for all crawled pages.
      # JSON body of the crawl request. Only `url` is required (see the
      # `required` list at the end of this schema); every other property is
      # optional.
      requestBody:
        required: true
        content:
          application/json:
            schema:
              type: object
              properties:
                url:
                  type: string
                  format: uri
                  description: >-
                    The starting URL for the crawl (must include http:// or
                    https:// protocol)
                # --- Crawl scope ---
                maxPages:
                  type: integer
                  minimum: 1
                  maximum: 500
                  default: 100
                  description: 'Maximum number of pages to crawl. Hard cap: 500.'
                # No `maximum` or `default` is declared for maxDepth.
                maxDepth:
                  type: integer
                  minimum: 0
                  description: >-
                    Maximum link depth from the starting URL (0 = only the
                    starting page)
                urlRegex:
                  type: string
                  description: >-
                    Regex pattern. Only URLs matching this pattern will be
                    followed and scraped.
                  # Quoted so the regex metacharacters are never read as YAML
                  # syntax; the string value is unchanged.
                  example: '^https?://[^/]+/blog/'
                # --- Markdown output options ---
                includeLinks:
                  type: boolean
                  default: true
                  description: Preserve hyperlinks in the Markdown output
                includeImages:
                  type: boolean
                  default: false
                  description: Include image references in the Markdown output
                shortenBase64Images:
                  type: boolean
                  default: true
                  description: Truncate base64-encoded image data in the Markdown output
                useMainContentOnly:
                  type: boolean
                  default: false
                  description: >-
                    Extract only the main content, stripping headers, footers,
                    sidebars, and navigation
                followSubdomains:
                  type: boolean
                  default: false
                  description: >-
                    When true, follow links on subdomains of the starting URL's
                    domain (e.g. docs.example.com when starting from
                    example.com). www and apex are always treated as equivalent.
                parsePDF:
                  type: boolean
                  default: true
                  description: >-
                    When true (default), PDF pages are fetched and their text
                    layer is extracted and converted to Markdown alongside HTML
                    pages. When false, PDF pages are skipped entirely (not
                    included in results and not counted as failures).
                includeFrames:
                  type: boolean
                  default: false
                  description: >-
                    When true, the contents of iframes are rendered to Markdown
                    for each crawled page.
                # --- Caching and timing (all values in milliseconds) ---
                # 86400000 ms = 1 day (default); 2592000000 ms = 30 days (max).
                maxAgeMs:
                  type: integer
                  minimum: 0
                  maximum: 2592000000
                  default: 86400000
                  description: >-
                    Return a cached result if a prior scrape for the same
                    parameters exists and is younger than this many
                    milliseconds. Defaults to 1 day (86400000 ms) when omitted.
                    Max is 30 days (2592000000 ms). Set to 0 to always scrape
                    fresh.
                # No `default` is declared for waitForMs.
                waitForMs:
                  type: integer
                  minimum: 0
                  maximum: 30000
                  description: >-
                    Optional browser wait time in milliseconds after initial
                    page load for each crawled page. Min: 0. Max: 30000 (30
                    seconds).
                # Spelled with a capital `MS`, unlike maxAgeMs / waitForMs.
                # The key is part of the request contract, so it is kept
                # as is. Bounds come from the referenced component schema.
                timeoutMS:
                  $ref: '#/components/schemas/TimeoutMS'
              required:
                - url
      responses:
        # Success body: `results` (per-page entries) plus `metadata`
        # (crawl-wide counters). Both are required.
        '200':
          description: Successful response
          content:
            application/json:
              schema:
                type: object
                properties:
                  # Each item pairs the extracted Markdown with metadata about
                  # the page it came from.
                  results:
                    type: array
                    items:
                      type: object
                      properties:
                        markdown:
                          type: string
                          description: >-
                            Extracted page content as Markdown (empty string on
                            failure)
                        # Per-page metadata; every field is required.
                        metadata:
                          type: object
                          properties:
                            url:
                              type: string
                              description: The URL that was fetched
                            title:
                              type: string
                              description: >-
                                The page's <title> content (empty string if
                                unavailable)
                            crawlDepth:
                              type: integer
                              description: >-
                                Depth relative to the start URL. 0 = start URL,
                                1 = one link away.
                            statusCode:
                              type: integer
                              description: HTTP status code of the response
                            success:
                              type: boolean
                              description: >-
                                true if the page was fetched and parsed
                                successfully
                          required:
                            - url
                            - title
                            - crawlDepth
                            - statusCode
                            - success
                      required:
                        - markdown
                        - metadata
                  # Crawl-wide summary counters; every field is required.
                  # NOTE(review): how numUrls relates to numSucceeded,
                  # numFailed and numSkipped is not stated here - confirm.
                  metadata:
                    type: object
                    properties:
                      numUrls:
                        type: integer
                        description: Total number of URLs crawled
                      maxCrawlDepth:
                        type: integer
                        description: Maximum crawl depth reached during the crawl
                      numSucceeded:
                        type: integer
                        description: Number of pages successfully crawled
                      numFailed:
                        type: integer
                        description: Number of pages that failed to crawl
                      numSkipped:
                        type: integer
                        description: >-
                          Number of URLs skipped (PDFs when parsePDF=false, or
                          URLs not matching urlRegex)
                    required:
                      - numUrls
                      - maxCrawlDepth
                      - numSucceeded
                      - numFailed
                      - numSkipped
                required:
                  - results
                  - metadata
        # Error responses. Every error body has the same two string fields,
        # `message` and `error_code`; only the `error_code` enum differs per
        # status.
        '400':
          description: Bad request - Invalid URL or parameters
          content:
            application/json:
              schema:
                type: object
                properties:
                  message:
                    type: string
                    description: Error message describing the issue
                  error_code:
                    type: string
                    enum:
                      - INPUT_VALIDATION_ERROR
                      - WEBSITE_ACCESS_ERROR
                    description: Error code indicating the type of error
                required:
                  - message
                  - error_code
        # NOTE(review): unlike the 400 schema, the 401, 403, 408 and 500
        # schemas declare no `required` list, so both fields are formally
        # optional there. Confirm whether that is intended.
        '401':
          description: Unauthorized - Invalid or missing API key
          content:
            application/json:
              schema:
                type: object
                properties:
                  message:
                    type: string
                    description: Error message
                  error_code:
                    type: string
                    enum:
                      - UNAUTHORIZED
                    description: Error code indicating unauthorized access
        '403':
          description: Forbidden - Insufficient permissions or usage limit exceeded
          content:
            application/json:
              schema:
                type: object
                properties:
                  message:
                    type: string
                    description: Error message
                  error_code:
                    type: string
                    enum:
                      - FORBIDDEN
                      - USAGE_EXCEEDED
                      - DISABLED
                      - INSUFFICIENT_PERMISSIONS
                    description: Error code indicating forbidden access
        # The TimeoutMS component schema describes requests that exceed
        # `timeoutMS` as being aborted with this status.
        '408':
          description: Request timeout
          content:
            application/json:
              schema:
                type: object
                properties:
                  message:
                    type: string
                    description: Timeout error message
                  error_code:
                    type: string
                    enum:
                      - REQUEST_TIMEOUT
                    description: Error code indicating request timeout
        '500':
          description: Internal server error
          content:
            application/json:
              schema:
                type: object
                properties:
                  message:
                    type: string
                    description: Error message
                  error_code:
                    type: string
                    enum:
                      - INTERNAL_ERROR
                    description: Error code indicating internal server error
      # This operation requires the `bearerAuth` scheme declared under
      # components.securitySchemes.
      security:
        - bearerAuth: []
      # SDK usage snippets, one per language. Each `source` is a literal
      # block scalar (`|-`): line breaks are kept as written and the
      # trailing newline is stripped.
      x-codeSamples:
        - lang: JavaScript
          source: |-
            import ContextDev from 'context.dev';

            const client = new ContextDev({
              apiKey: process.env['CONTEXT_DEV_API_KEY'], // This is the default and can be omitted
            });

            const response = await client.web.webCrawlMd({ url: 'https://example.com' });

            console.log(response.metadata);
        - lang: Python
          source: |-
            import os
            from context.dev import ContextDev

            client = ContextDev(
                api_key=os.environ.get("CONTEXT_DEV_API_KEY"),  # This is the default and can be omitted
            )
            response = client.web.web_crawl_md(
                url="https://example.com",
            )
            print(response.metadata)
        - lang: Ruby
          source: |-
            require "context_dev"

            context_dev = ContextDev::Client.new(api_key: "My API Key")

            response = context_dev.web.web_crawl_md(url: "https://example.com")

            puts(response)
# Reusable definitions referenced from the operation above.
components:
  schemas:
    # Request timeout, referenced by the `timeoutMS` request property.
    # 1000 ms = 1 second; 300000 ms = 5 minutes. No `default` is declared.
    TimeoutMS:
      type: integer
      minimum: 1000
      maximum: 300000
      description: >-
        Optional timeout in milliseconds for the request. If the request takes
        longer than this value, it will be aborted with a 408 status code.
        Minimum allowed value is 1000ms (1 second); maximum allowed value is
        300000ms (5 minutes).
  securitySchemes:
    # HTTP bearer authentication, referenced by the operation's `security`.
    bearerAuth:
      type: http
      scheme: bearer

````