Skip to content

PDF Hierarchy Detection

Classifies text blocks in a PDF into heading levels (H1–H6) and body text based on font size analysis. Uses K-means clustering to group font sizes, then assigns heading levels by font size rank — the cluster with the largest font size becomes H1, the next largest becomes H2, and so on.

Quick Start

Python
from kreuzberg import extract_file_sync, ExtractionConfig, PdfConfig, HierarchyConfig

# Hierarchy detection settings, built step by step for readability.
hierarchy_options = HierarchyConfig(
    enabled=True,
    k_clusters=6,
    include_bbox=True,
    ocr_coverage_threshold=0.8,
)
pdf_settings = PdfConfig(extract_metadata=True, hierarchy=hierarchy_options)
config: ExtractionConfig = ExtractionConfig(pdf_options=pdf_settings)

result = extract_file_sync("document.pdf", config=config)

# Print a short preview of every extracted page; when no pages were
# returned the loop simply does nothing.
pages = result.pages or []
for page in pages:
    print(f"Page {page.page_number}:")
    print(f"  Content: {page.content[:100]}...")
TypeScript
import { extractFile } from '@kreuzberg/node';

// Hierarchy detection settings, kept separate from the surrounding PDF options.
const hierarchy = {
    enabled: true,
    kClusters: 6,
    includeBbox: true,
    ocrCoverageThreshold: 0.8,
};
const pdfOptions = { extractMetadata: true, hierarchy };
const config = { pdfOptions };

const result = await extractFile('document.pdf', null, config);

// Print a short preview of each page; a missing `pages` value prints nothing.
for (const page of result.pages ?? []) {
    console.log(`Page ${page.pageNumber}:`);
    console.log(`  Content: ${page.content.substring(0, 100)}...`);
}
Rust
use kreuzberg::{extract_file_sync, ExtractionConfig, PdfConfig, HierarchyConfig};

/// Extracts `document.pdf` with hierarchy detection enabled and prints how
/// many hierarchy blocks were found on each page.
///
/// The `HierarchyConfig` fields mirror the documented parameters
/// (`enabled`, `k_clusters`, `include_bbox`, `ocr_coverage_threshold`).
fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        pdf_options: Some(PdfConfig {
            hierarchy: Some(HierarchyConfig {
                enabled: true,
                // Number of font-size clusters; 6 maps to H1–H6.
                k_clusters: 6,
                include_bbox: true,
                // Trigger OCR when less than 80% of a page has text.
                ocr_coverage_threshold: Some(0.8),
            }),
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_file_sync("document.pdf", None::<&str>, &config)?;

    // Hierarchy data lives on each page (`result.pages[n].hierarchy`), not on
    // the top-level result.
    // NOTE(review): assumes `pages` and `hierarchy` are `Option`s, matching the
    // "may be None" handling in the other language examples — confirm against
    // the crate's type definitions.
    for page in result.pages.iter().flatten() {
        if let Some(hierarchy) = &page.hierarchy {
            println!(
                "Page {}: {} hierarchy blocks",
                page.page_number,
                hierarchy.blocks.len()
            );
        }
    }
    Ok(())
}
Go
package main

import "github.com/kreuzberg-dev/kreuzberg/packages/go/v4"

// main builds two example extraction configs that enable PDF hierarchy
// detection. Neither config is used for an extraction here; they only
// illustrate how the options are set.
func main() {
	// Basic hierarchy configuration (default cluster count).
	config := &kreuzberg.ExtractionConfig{
		PdfOptions: &kreuzberg.PdfConfig{
			ExtractImages: true,
			Hierarchy: &kreuzberg.HierarchyConfig{
				Enabled:              kreuzberg.BoolPtr(true),
				KClusters:            kreuzberg.IntPtr(6),
				IncludeBbox:          kreuzberg.BoolPtr(true),
				OcrCoverageThreshold: kreuzberg.Float64Ptr(0.8),
			},
		},
	}

	// Advanced hierarchy configuration with more clusters for deeply nested
	// documents. The documented range for k_clusters is 2–10, so stay
	// inside it.
	advancedConfig := &kreuzberg.ExtractionConfig{
		PdfOptions: &kreuzberg.PdfConfig{
			ExtractImages: true,
			Hierarchy: &kreuzberg.HierarchyConfig{
				Enabled:              kreuzberg.BoolPtr(true),
				KClusters:            kreuzberg.IntPtr(8),
				IncludeBbox:          kreuzberg.BoolPtr(true),
				OcrCoverageThreshold: kreuzberg.Float64Ptr(0.8),
			},
		},
	}

	// Blank assignments keep the compiler from rejecting the unused variables.
	_ = config
	_ = advancedConfig
}
Java
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.PdfConfig;
import dev.kreuzberg.config.HierarchyConfig;

// Enable hierarchy detection using the documented parameters:
// enabled, k_clusters, include_bbox and ocr_coverage_threshold.
// NOTE(review): builder method names follow the camelCase form of the
// documented parameters — confirm against the Java HierarchyConfig reference.
ExtractionConfig config = ExtractionConfig.builder()
    .pdfOptions(PdfConfig.builder()
        .hierarchyConfig(HierarchyConfig.builder()
            .enabled(true)
            .kClusters(6)                // font-size clusters; 6 maps to H1–H6
            .includeBbox(true)           // include bounding box coordinates
            .ocrCoverageThreshold(0.8)   // OCR if < 80% of the page has text
            .build())
        .build())
    .build();
C#
using Kreuzberg;

// Basic hierarchy configuration with properties
var config = new ExtractionConfig
{
    PdfOptions = new PdfConfig
    {
        ExtractImages = true,
        Hierarchy = new HierarchyConfig
        {
            Enabled = true,
            KClusters = 6,
            IncludeBbox = true,
            OcrCoverageThreshold = 0.8f
        }
    }
};

var result = await KreuzbergClient.ExtractFileAsync("document.pdf", config);
Console.WriteLine($"Content length: {result.Content.Length}");

// Advanced hierarchy detection with custom parameters
var advancedConfig = new ExtractionConfig
{
    PdfOptions = new PdfConfig
    {
        ExtractImages = true,
        Hierarchy = new HierarchyConfig
        {
            Enabled = true,
            KClusters = 8,            // More clusters for detailed hierarchy (documented range: 2–10)
            IncludeBbox = true,       // Include bounding box coordinates
            OcrCoverageThreshold = 0.7f  // Lower threshold than the basic example: OCR only runs when text coverage is below 70%
        }
    }
};

// Each result gets its own name: redeclaring `result` in the same scope does not compile.
var advancedResult = await KreuzbergClient.ExtractFileAsync("complex_document.pdf", advancedConfig);
Console.WriteLine($"Advanced hierarchy detection completed: {advancedResult.Content.Length} chars");

// Minimal configuration with only enabled flag
var minimalConfig = new ExtractionConfig
{
    PdfOptions = new PdfConfig
    {
        Hierarchy = new HierarchyConfig
        {
            Enabled = true
            // Other properties use defaults:
            // KClusters = 6
            // IncludeBbox = true
        }
    }
};

var minimalResult = await KreuzbergClient.ExtractFileAsync("document.pdf", minimalConfig);
Console.WriteLine("Extraction with default hierarchy settings complete");

// Disabling hierarchy detection
var noHierarchyConfig = new ExtractionConfig
{
    PdfOptions = new PdfConfig
    {
        Hierarchy = new HierarchyConfig
        {
            Enabled = false
        }
    }
};

var noHierarchyResult = await KreuzbergClient.ExtractFileAsync("document.pdf", noHierarchyConfig);
Console.WriteLine("Extraction without hierarchy detection complete");
Ruby
require 'kreuzberg'

# Keyword-argument form: build the nested config objects one step at a time
hierarchy = Kreuzberg::Config::Hierarchy.new(
  enabled: true,
  k_clusters: 6,
  include_bbox: true,
  ocr_coverage_threshold: 0.8
)
pdf_options = Kreuzberg::Config::PDF.new(extract_images: true, hierarchy: hierarchy)
config = Kreuzberg::Config::Extraction.new(pdf_options: pdf_options)

# Hash form: the hierarchy settings are passed as a plain Hash instead
hierarchy_settings = {
  enabled: true,
  k_clusters: 6,
  include_bbox: true,
  ocr_coverage_threshold: 0.8
}
config = Kreuzberg::Config::Extraction.new(
  pdf_options: Kreuzberg::Config::PDF.new(
    extract_images: true,
    hierarchy: hierarchy_settings
  )
)

Output

Hierarchy data is in result.pages[n].hierarchy. Each page has a blocks list:

{
  "block_count": 3,
  "blocks": [
    { "text": "Chapter 1: Introduction", "level": "h1", "font_size": 24.0, "bbox": [50.0, 100.0, 400.0, 125.0] },
    { "text": "Background", "level": "h2", "font_size": 18.0, "bbox": [50.0, 150.0, 300.0, 168.0] },
    { "text": "This chapter provides...", "level": "body", "font_size": 12.0, "bbox": [50.0, 200.0, 550.0, 450.0] }
  ]
}
  • bbox: [left, top, right, bottom] in PDF points (present when include_bbox=True). This is the only way to obtain bounding box coordinates for text elements — they are not included by default.
  • level: "h1"–"h6" or "body"

Configuration

Parameter Type Default Description
enabled bool true Enable hierarchy extraction
k_clusters int 6 Font size clusters (2–10), maps to heading levels
include_bbox bool true Include bounding box coordinates
ocr_coverage_threshold float | None None Trigger OCR if text coverage is below this fraction

Choosing k_clusters

k_clusters Heading levels Use when
2–3 H1–H2 Simple documents with 1–2 heading sizes
4–5 H1–H4 Standard documents
6 (default) H1–H6 Most documents
7–8 H1–H6+ Books, specs with deep nesting

ocr_coverage_threshold

Threshold Behavior
None OCR never triggered by coverage
0.3 OCR if < 30% of page has text
0.5 OCR if < 50% of page has text

Requires an OCR backend to be configured separately.

Troubleshooting

  • hierarchy is None — Check hierarchy.enabled is True. If the PDF is image-only, enable OCR. If fewer text blocks than k_clusters, reduce k_clusters.
  • Most blocks classified as body — Document may use uniform font sizes. Reduce k_clusters (try 3–4).
  • Heading levels don't match visual inspection — Levels are assigned by font size rank, not absolute size. Filter on block.font_size directly for absolute thresholds.

See the HierarchyConfig reference for the full parameter list.