PDF Hierarchy Detection¶
Classifies text blocks in a PDF into heading levels (H1–H6) and body text based on font size analysis. Uses K-means clustering to group font sizes, then assigns heading levels by font-size rank — the cluster with the largest font size becomes H1, the next largest becomes H2, and so on.
Quick Start¶
Python
from kreuzberg import extract_file_sync, ExtractionConfig, PdfConfig, HierarchyConfig
# Enable hierarchy detection with 6 font-size clusters and bounding boxes.
config: ExtractionConfig = ExtractionConfig(
    pdf_options=PdfConfig(
        extract_metadata=True,
        hierarchy=HierarchyConfig(
            enabled=True,
            k_clusters=6,
            include_bbox=True,
            ocr_coverage_threshold=0.8,
        ),
    )
)

result = extract_file_sync("document.pdf", config=config)

# Access hierarchy information
for page in result.pages or []:
    print(f"Page {page.page_number}:")
    print(f" Content: {page.content[:100]}...")
    # A page's hierarchy can be None (see Troubleshooting), so guard before
    # walking its blocks.
    if page.hierarchy is not None:
        for block in page.hierarchy.blocks:
            print(f" [{block.level}] {block.text}")
TypeScript
import { extractFile } from '@kreuzberg/node';
// Hierarchy detection settings, kept separate so they are easy to tweak.
const hierarchy = {
  enabled: true,
  kClusters: 6,
  includeBbox: true,
  ocrCoverageThreshold: 0.8,
};

const config = {
  pdfOptions: { extractMetadata: true, hierarchy },
};

const result = await extractFile('document.pdf', null, config);

// Log the page number and the first 100 characters of each page's content;
// nothing is logged when `pages` is absent.
for (const page of result.pages ?? []) {
  console.log(`Page ${page.pageNumber}:`);
  console.log(` Content: ${page.content.substring(0, 100)}...`);
}
Rust
use kreuzberg::{extract_file_sync, ExtractionConfig, PdfConfig, HierarchyConfig};
/// Extracts `document.pdf` with hierarchy detection enabled and prints how
/// many classified blocks were found on each page.
///
/// # Errors
///
/// Propagates any error returned by `extract_file_sync`.
fn main() -> kreuzberg::Result<()> {
    // Uses the four documented hierarchy parameters; every other option keeps
    // its default.
    let config = ExtractionConfig {
        pdf_options: Some(PdfConfig {
            hierarchy: Some(HierarchyConfig {
                enabled: true,
                k_clusters: 6,
                include_bbox: true,
                ocr_coverage_threshold: Some(0.8),
            }),
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_file_sync("document.pdf", None::<&str>, &config)?;

    // Hierarchy data is attached to each page, not to the result as a whole.
    // NOTE(review): assumes `pages` and `hierarchy` are `Option`s, mirroring
    // the None checks in the other language examples — confirm against the
    // crate's type definitions.
    for page in result.pages.iter().flatten() {
        if let Some(hierarchy) = &page.hierarchy {
            println!("Page {}: {} blocks", page.page_number, hierarchy.block_count);
        }
    }

    Ok(())
}
Go
package main
import "github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
// main builds two example hierarchy configurations. Neither is passed to an
// extraction call in this snippet.
func main() {
	// Basic hierarchy configuration
	config := &kreuzberg.ExtractionConfig{
		PdfOptions: &kreuzberg.PdfConfig{
			ExtractImages: true,
			Hierarchy: &kreuzberg.HierarchyConfig{
				Enabled:              kreuzberg.BoolPtr(true),
				KClusters:            kreuzberg.IntPtr(6),
				IncludeBbox:          kreuzberg.BoolPtr(true),
				OcrCoverageThreshold: kreuzberg.Float64Ptr(0.8),
			},
		},
	}

	// Advanced hierarchy configuration with more clusters.
	// 8 stays inside the documented k_clusters range (2–10) and is the
	// setting suggested for deeply nested documents.
	advancedConfig := &kreuzberg.ExtractionConfig{
		PdfOptions: &kreuzberg.PdfConfig{
			ExtractImages: true,
			Hierarchy: &kreuzberg.HierarchyConfig{
				Enabled:              kreuzberg.BoolPtr(true),
				KClusters:            kreuzberg.IntPtr(8),
				IncludeBbox:          kreuzberg.BoolPtr(true),
				OcrCoverageThreshold: kreuzberg.Float64Ptr(0.8),
			},
		},
	}

	// Go rejects unused local variables, so both configs are assigned to the
	// blank identifier to keep the example compilable.
	_ = config
	_ = advancedConfig
}
Java
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.PdfConfig;
import dev.kreuzberg.config.HierarchyConfig;
// Enable hierarchy detection using the parameters listed in the
// Configuration section: cluster count, bounding boxes, and the OCR coverage
// threshold.
// NOTE(review): builder method names kClusters/includeBbox follow the
// camelCase form used by the TypeScript example — confirm against the Java
// HierarchyConfig builder.
ExtractionConfig config = ExtractionConfig.builder()
    .pdfOptions(PdfConfig.builder()
        .hierarchyConfig(HierarchyConfig.builder()
            .enabled(true)
            .kClusters(6)
            .includeBbox(true)
            .ocrCoverageThreshold(0.8)
            .build())
        .build())
    .build();
C#
using Kreuzberg;
// Basic hierarchy configuration with properties
var config = new ExtractionConfig
{
    PdfOptions = new PdfConfig
    {
        ExtractImages = true,
        Hierarchy = new HierarchyConfig
        {
            Enabled = true,
            KClusters = 6,
            IncludeBbox = true,
            OcrCoverageThreshold = 0.8f
        }
    }
};
var result = await KreuzbergClient.ExtractFileAsync("document.pdf", config);
Console.WriteLine($"Content length: {result.Content.Length}");

// Advanced hierarchy detection with custom parameters
var advancedConfig = new ExtractionConfig
{
    PdfOptions = new PdfConfig
    {
        ExtractImages = true,
        Hierarchy = new HierarchyConfig
        {
            Enabled = true,
            KClusters = 8,               // More clusters for detailed hierarchy (documented range: 2–10)
            IncludeBbox = true,          // Include bounding box coordinates
            OcrCoverageThreshold = 0.7f  // Lower threshold than 0.8: OCR only if < 70% of the page has text
        }
    }
};
// Each extraction result gets its own name: C# does not allow a local to be
// declared twice in the same scope.
var advancedResult = await KreuzbergClient.ExtractFileAsync("complex_document.pdf", advancedConfig);
Console.WriteLine($"Advanced hierarchy detection completed: {advancedResult.Content.Length} chars");

// Minimal configuration with only enabled flag
var minimalConfig = new ExtractionConfig
{
    PdfOptions = new PdfConfig
    {
        Hierarchy = new HierarchyConfig
        {
            // Members of an object initializer are separated by commas, never
            // terminated with a semicolon.
            Enabled = true
            // Other properties use defaults:
            // KClusters = 6
            // IncludeBbox = true
        }
    }
};
var minimalResult = await KreuzbergClient.ExtractFileAsync("document.pdf", minimalConfig);
Console.WriteLine("Extraction with default hierarchy settings complete");

// Disabling hierarchy detection
var noHierarchyConfig = new ExtractionConfig
{
    PdfOptions = new PdfConfig
    {
        Hierarchy = new HierarchyConfig
        {
            Enabled = false
        }
    }
};
var noHierarchyResult = await KreuzbergClient.ExtractFileAsync("document.pdf", noHierarchyConfig);
Console.WriteLine("Extraction without hierarchy detection complete");
Ruby
require 'kreuzberg'
# Variant 1: hierarchy settings passed as a Kreuzberg::Config::Hierarchy
# instance built from keyword arguments.
hierarchy = Kreuzberg::Config::Hierarchy.new(
  enabled: true,
  k_clusters: 6,
  include_bbox: true,
  ocr_coverage_threshold: 0.8
)
pdf_options = Kreuzberg::Config::PDF.new(extract_images: true, hierarchy: hierarchy)
config = Kreuzberg::Config::Extraction.new(pdf_options: pdf_options)

# Variant 2: the same settings passed as a plain Hash instead of a
# Hierarchy instance. This reassigns `config`.
hierarchy_settings = {
  enabled: true,
  k_clusters: 6,
  include_bbox: true,
  ocr_coverage_threshold: 0.8
}
pdf_options = Kreuzberg::Config::PDF.new(extract_images: true, hierarchy: hierarchy_settings)
config = Kreuzberg::Config::Extraction.new(pdf_options: pdf_options)
Output¶
Hierarchy data is in result.pages[n].hierarchy. Each page has a blocks list:
{
"block_count": 3,
"blocks": [
{ "text": "Chapter 1: Introduction", "level": "h1", "font_size": 24.0, "bbox": [50.0, 100.0, 400.0, 125.0] },
{ "text": "Background", "level": "h2", "font_size": 18.0, "bbox": [50.0, 150.0, 300.0, 168.0] },
{ "text": "This chapter provides...", "level": "body", "font_size": 12.0, "bbox": [50.0, 200.0, 550.0, 450.0] }
]
}
- `bbox`: `[left, top, right, bottom]` in PDF points (present when `include_bbox=True`). This is the only way to obtain bounding box coordinates for text elements — they are not included by default.
- `level`: `"h1"`–`"h6"` or `"body"`
Configuration¶
| Parameter | Type | Default | Description |
|---|---|---|---|
| `enabled` | `bool` | `true` | Enable hierarchy extraction |
| `k_clusters` | `int` | `6` | Font size clusters (2–10), maps to heading levels |
| `include_bbox` | `bool` | `true` | Include bounding box coordinates |
| `ocr_coverage_threshold` | `float \| None` | `None` | Trigger OCR if text coverage is below this fraction |
Choosing k_clusters¶
| `k_clusters` | Heading levels | Use when |
|---|---|---|
| 2–3 | H1–H2 | Simple documents with 1–2 heading sizes |
| 4–5 | H1–H4 | Standard documents |
| 6 (default) | H1–H6 | Most documents |
| 7–8 | H1–H6+ | Books, specs with deep nesting |
ocr_coverage_threshold¶
| Threshold | Behavior |
|---|---|
| `None` | OCR never triggered by coverage |
| `0.3` | OCR if < 30% of page has text |
| `0.5` | OCR if < 50% of page has text |
Requires an OCR backend to be configured separately.
Troubleshooting¶
- `hierarchy` is `None` — Check that `hierarchy.enabled` is `True`. If the PDF is image-only, enable OCR. If there are fewer text blocks than `k_clusters`, reduce `k_clusters`.
- Most blocks classified as `body` — Document may use uniform font sizes. Reduce `k_clusters` (try 3–4).
- Heading levels don't match visual inspection — Levels are assigned by font size rank, not absolute size. Filter on `block.font_size` directly for absolute thresholds.
See the HierarchyConfig reference for the full parameter list.