Skip to content

Document Structure

Represents a document as a flat list of nodes with explicit parent-child index references — a traversable tree with heading levels, content layers, inline annotations, and structured table grids.

Use document structure when you need hierarchical relationships between sections. For a flat list of semantic elements, use element-based output. For plain text, use the default unified output.

Comparison

| Aspect | Unified (default) | Element-based | Document structure |
|---|---|---|---|
| Output shape | content: string | elements: array | nodes: array with index refs |
| Hierarchy | None | Inferred from levels | Explicit parent/child indices |
| Inline annotations | No | No | Bold, italic, links per node |
| Tables | result.tables | Table elements | TableGrid with cell coords |
| Content layers | Not classified | Not classified | body, header, footer, footnote |
| Best for | LLM prompts, full-text | RAG chunking | Knowledge graphs, structured apps |

Enable

Document Structure Config (Python)
from kreuzberg import extract_file_sync, ExtractionConfig

# Ask the extractor to build the document tree in addition to its normal output.
config = ExtractionConfig(include_document_structure=True)
result = extract_file_sync("document.pdf", config=config)

# Walk the tree only when result.document is truthy, printing one
# "[node_type] text" line per node with the text cut to 80 characters.
document = result.document
if document:
    for entry in document["nodes"]:
        payload = entry["content"]
        kind = payload["node_type"]
        preview = payload.get("text", "")[:80]
        print(f"[{kind}] {preview}")
Document Structure Config (TypeScript)
import { extractFileSync, ExtractionConfig } from '@kreuzberg/node';

// Request the document tree alongside the regular extraction output.
const config: ExtractionConfig = { includeDocumentStructure: true };

const result = extractFileSync("document.pdf", undefined, config);

// Print one "[nodeType] text" line per node; nothing is printed when
// result.document is absent.
const nodes = result.document ? result.document.nodes : [];
for (const { content } of nodes) {
  const text = content.text ?? "";
  console.log(`[${content.nodeType}] ${text}`);
}
Document Structure Config (Rust)
use kreuzberg::{extract_file_sync, ExtractionConfig};

// Enable the document tree; every other option keeps its default.
let config = ExtractionConfig {
    include_document_structure: true,
    ..Default::default()
};

let result = extract_file_sync("document.pdf", None, &config)?;

// Print one "[node_type] text" line per node when a tree was returned.
if let Some(document) = &result.document {
    for node in &document.nodes {
        let text = node.content.text().unwrap_or("");
        // Truncate by characters, not bytes: slicing a `str` at a byte index
        // that falls inside a multi-byte UTF-8 sequence panics at runtime.
        let preview: String = text.chars().take(80).collect();
        println!("[{}] {}", node.content.node_type_str(), preview);
    }
}
Document Structure Config (Go)
package main

import (
    "fmt"
    kreuzberg "github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

// main extracts document.pdf with the document tree enabled and prints the
// type of every node in the tree.
func main() {
    cfg := kreuzberg.NewExtractionConfig(kreuzberg.WithIncludeDocumentStructure(true))

    res, err := kreuzberg.ExtractFileSync("document.pdf", cfg)
    if err != nil {
        panic(err)
    }

    // No tree in the result: nothing to print.
    doc := res.Document
    if doc == nil {
        return
    }

    for i := range doc.Nodes {
        fmt.Printf("[%s]\n", doc.Nodes[i].Content.NodeType)
    }
}
Document Structure Config (Java)
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.ExtractionResult;

// Build a config that asks for the document tree.
ExtractionConfig config = ExtractionConfig.builder().includeDocumentStructure(true).build();

ExtractionResult result = Kreuzberg.extractFileSync("document.pdf", config);

// Print the type of every node when a document structure is present.
var structure = result.getDocumentStructure();
if (structure.isPresent()) {
    for (var node : structure.get().nodes()) {
        var nodeType = node.content().nodeType();
        System.out.println("[" + nodeType + "]");
    }
}
Document Structure Config (C#)
using Kreuzberg;

// Request the document tree in the extraction result.
var config = new ExtractionConfig { IncludeDocumentStructure = true };

var result = KreuzbergClient.ExtractFileSync("document.pdf", config);

// Print the type of every node; skipped entirely when Document is null.
if (result.Document is { } document)
{
    foreach (var node in document.Nodes)
    {
        var nodeType = node.Content.NodeType;
        Console.WriteLine($"[{nodeType}]");
    }
}
Document Structure Config (Ruby)
require 'kreuzberg'

# Request the document tree in the extraction result.
config = Kreuzberg::Config::Extraction.new(include_document_structure: true)
result = Kreuzberg.extract_file_sync('document.pdf', config: config)

# Print one "[node_type] text" line per node, text cut to 80 characters.
document = result.document
if document
  document['nodes'].each do |node|
    content = node['content']
    kind = content['node_type']
    preview = (content['text'] || '')[0...80]
    puts "[#{kind}] #{preview}"
  end
end
Document Structure Config (R)
library(kreuzberg)

# Enable the document structure and request markdown output.
config <- extraction_config(
  include_document_structure = TRUE,
  output_format = "markdown"
)

result <- extract_file_sync("document.pdf", config = config)

# NOTE(review): this example prints per-page content rather than walking the
# document tree; confirm how the R binding exposes the node list.
pages <- result$pages
cat(sprintf("Total pages: %d\n", length(pages)))
cat(sprintf("MIME type: %s\n\n", result$mime_type))

# Show the first 100 characters of each page's content.
for (idx in seq_along(pages)) {
  current <- pages[[idx]]
  cat(sprintf("Page %d structure:\n", idx))
  cat(sprintf("  Content: %s\n", substr(current$content, 1, 100)))
  cat("\n")
}

Node Shape

Each node in result.document.nodes:

{
  "id": "node-a3f2b1c4",
  "content": { "node_type": "heading", "level": 2, "text": "Supervised Learning" },
  "parent": 0,
  "children": [4, 5, 6],
  "content_layer": "body",
  "page": 5,
  "page_end": null,
  "bbox": { "x0": 72.0, "y0": 600.0, "x1": 400.0, "y1": 620.0 },
  "annotations": []
}
  • parent and children are integer indices into the nodes array (null if absent)
  • bbox is present when bounding box data is available
  • annotations contains inline formatting spans

Node Types

| node_type | Key fields | Notes |
|---|---|---|
| title | text | Document title |
| heading | level (1–6), text | Section heading |
| paragraph | text | Body paragraph; may have annotations |
| list | ordered (bool) | Container; children are list_item nodes |
| list_item | text | Child of list |
| table | grid (TableGrid) | Grid with cell-level data |
| image | description, image_index | image_index references result.images |
| code | text, language | Code block |
| quote | (container) | Children are typically paragraphs |
| formula | text | Math formula (plain text, LaTeX, or MathML) |
| footnote | text | Usually content_layer: "footnote" |
| group | label, heading_level, heading_text | Section grouping container |
| page_break | (marker) | Page boundary |

Content Layers

Layer Description
body Main document content
header Page header area (repeated chapter titles)
footer Page footer area (page numbers, copyright)
footnote Footnotes and endnotes
# Hand only the nodes whose content_layer is "body" to process_main_content,
# in document order.
body_nodes = (n for n in result.document["nodes"] if n["content_layer"] == "body")
for node in body_nodes:
    process_main_content(node)

Text Annotations

Paragraphs carry a list of annotations marking character spans:

{ "start": 0, "end": 16, "kind": { "annotation_type": "bold" } }
| annotation_type | Extra fields |
|---|---|
| bold, italic, underline, strikethrough | (none) |
| code, subscript, superscript | (none) |
| link | url, title (optional) |
# Print every inline annotation as "<kind>: <covered text>"; links also show
# their target URL.
for node in result.document["nodes"]:
    annotations = node.get("annotations", [])
    if not annotations:
        # Nothing to report for nodes without inline formatting.
        continue
    # The node text is the same for every annotation, so look it up once.
    text = node["content"].get("text", "")
    # NOTE(review): this slices by Python string index. Confirm that start/end
    # are character offsets rather than UTF-8 byte offsets before relying on
    # this for non-ASCII text.
    for ann in annotations:
        kind = ann["kind"]["annotation_type"]
        span = text[ann["start"]:ann["end"]]
        if kind == "link":
            print(f"Link: {span} -> {ann['kind']['url']}")
        else:
            print(f"{kind}: {span}")

Table Grid

Table nodes contain a grid with cell-level data:

{
  "rows": 3, "cols": 3,
  "cells": [
    { "content": "Algorithm", "row": 0, "col": 0, "row_span": 1, "col_span": 1, "is_header": true },
    { "content": "Decision Tree", "row": 1, "col": 0, "row_span": 1, "col_span": 1, "is_header": false }
  ]
}

Each cell has content, row, col, row_span, col_span, is_header, and optionally bbox.

# Render every table node as pipe-separated rows of text.
for node in result.document["nodes"]:
    content = node["content"]
    if content["node_type"] != "table":
        continue
    grid = content["grid"]
    n_rows = grid["rows"]
    n_cols = grid["cols"]
    # Start from an empty n_rows x n_cols matrix; positions that no cell is
    # anchored at stay None and print as empty strings.
    matrix = [[None for _ in range(n_cols)] for _ in range(n_rows)]
    for cell in grid["cells"]:
        matrix[cell["row"]][cell["col"]] = cell["content"]
    for line in matrix:
        print(" | ".join(str(value or "") for value in line))