Skip to content

Element-Based Output v4.1.0

Segments a document into a flat array of typed elements — titles, paragraphs, tables, list items, code blocks, images, and more. Each element carries a page number and, for text elements in PDFs when hierarchy extraction is enabled, bounding box coordinates.

Use element-based output for RAG chunking, semantic search, or Unstructured.io-compatible pipelines. For hierarchical tree structure, use document structure. For plain text, use the default unified output.

Enable

Element-Based Output (Python)
from kreuzberg import extract_file_sync, ExtractionConfig

# Configure element-based output
config = ExtractionConfig(result_format="element_based")

# Extract document
result = extract_file_sync("document.pdf", config=config)

# Access elements
for element in result.elements:
    print(f"Type: {element.element_type}")
    # Show only the first 100 characters of the element text
    print(f"Text: {element.text[:100]}")

    # page_number is optional, so print it only when it is set
    if element.metadata.page_number:
        print(f"Page: {element.metadata.page_number}")

    # coordinates are optional as well; print the bounding box only when present
    if element.metadata.coordinates:
        coords = element.metadata.coordinates
        print(f"Coords: ({coords.left}, {coords.top}) - ({coords.right}, {coords.bottom})")

    print("---")

# Filter by element type
titles = [e for e in result.elements if e.element_type == "title"]
for title in titles:
    # Element-type-specific fields live in metadata.additional; fall back to
    # "unknown" when a title has no "level" entry
    level = title.metadata.additional.get("level", "unknown")
    print(f"[{level}] {title.text}")
Element-Based Output (TypeScript)
import { extractFileSync, ExtractionConfig } from '@kreuzberg/node';

// Configure element-based output
const config: ExtractionConfig = {
  outputFormat: "element_based"
};

// Extract document
const result = extractFileSync("document.pdf", null, config);

// Access elements
for (const element of result.elements) {
  console.log(`Type: ${element.elementType}`);
  // Show only the first 100 UTF-16 code units of the element text
  console.log(`Text: ${element.text.slice(0, 100)}`);

  // pageNumber is optional, so print it only when it is set
  if (element.metadata.pageNumber) {
    console.log(`Page: ${element.metadata.pageNumber}`);
  }

  // coordinates are optional as well; print the bounding box only when present
  if (element.metadata.coordinates) {
    const coords = element.metadata.coordinates;
    console.log(`Coords: (${coords.left}, ${coords.top}) - (${coords.right}, ${coords.bottom})`);
  }

  console.log("---");
}

// Filter by element type
const titles = result.elements.filter(e => e.elementType === "title");
for (const title of titles) {
  // Element-type-specific fields live in metadata.additional; fall back to
  // "unknown" when `additional` or its `level` entry is missing
  const level = title.metadata.additional?.level || "unknown";
  console.log(`[${level}] ${title.text}`);
}
Element-Based Output (Rust)
use kreuzberg::{extract_file_sync, ExtractionConfig};
use kreuzberg::types::OutputFormat as ResultFormat;

/// Extracts `document.pdf` in element-based mode, prints every element, and
/// then lists the titles together with their heading level.
///
/// # Errors
///
/// Propagates any error returned by `extract_file_sync`.
fn main() -> kreuzberg::Result<()> {
    // Configure element-based output (result_format controls Unified vs ElementBased)
    let config = ExtractionConfig {
        result_format: ResultFormat::ElementBased,
        ..Default::default()
    };

    // Extract document
    let result = extract_file_sync("document.pdf", None, &config)?;

    // Access elements
    if let Some(elements) = result.elements {
        for element in &elements {
            println!("Type: {:?}", element.element_type);

            // Truncate by characters, not bytes: slicing `&text[..100]` panics
            // when byte 100 falls inside a multi-byte UTF-8 character.
            let preview: String = element.text.chars().take(100).collect();
            println!("Text: {}", preview);

            // The page number is optional; print it only when present.
            if let Some(page) = element.metadata.page_number {
                println!("Page: {}", page);
            }

            // Coordinates are optional as well; print the bounding box only when present.
            if let Some(coords) = &element.metadata.coordinates {
                println!("Coords: ({}, {}) - ({}, {})",
                    coords.x0, coords.y0, coords.x1, coords.y1);
            }

            println!("---");
        }

        // Filter by element type
        let titles: Vec<_> = elements.iter()
            .filter(|e| matches!(e.element_type, kreuzberg::types::ElementType::Title))
            .collect();

        for title in titles {
            // Element-type-specific fields live in `metadata.additional`;
            // fall back to "unknown" when a title has no "level" entry.
            let level = title.metadata.additional.get("level")
                .map(|v| v.as_ref())
                .unwrap_or("unknown");
            println!("[{}] {}", level, title.text);
        }
    }

    Ok(())
}
Element-Based Output (Go)
package main

import (
    "fmt"
    "kreuzberg"
)

// main extracts document.pdf in element-based mode, prints every element,
// and then lists the titles together with their heading level.
func main() {
    // Configure element-based output
    config := &kreuzberg.ExtractionConfig{
        OutputFormat: "element_based",
    }

    // Extract document
    result, err := kreuzberg.ExtractFileSync("document.pdf", config)
    if err != nil {
        panic(err)
    }

    // Access elements
    for _, element := range result.Elements {
        fmt.Printf("Type: %s\n", element.ElementType)

        // Truncate by runes rather than bytes so a multi-byte UTF-8
        // character is never cut in half.
        text := element.Text
        if runes := []rune(text); len(runes) > 100 {
            text = string(runes[:100])
        }
        fmt.Printf("Text: %s\n", text)

        // PageNumber is a pointer; nil means no page number is available.
        if element.Metadata.PageNumber != nil {
            fmt.Printf("Page: %d\n", *element.Metadata.PageNumber)
        }

        // Coordinates are optional as well; print the bounding box only when present.
        if element.Metadata.Coordinates != nil {
            coords := element.Metadata.Coordinates
            fmt.Printf("Coords: (%f, %f) - (%f, %f)\n",
                coords.Left, coords.Top, coords.Right, coords.Bottom)
        }

        fmt.Println("---")
    }

    // Filter by element type
    var titles []kreuzberg.Element
    for _, element := range result.Elements {
        if element.ElementType == "title" {
            titles = append(titles, element)
        }
    }

    for _, title := range titles {
        // Element-type-specific fields live in Metadata.Additional; fall back
        // to "unknown" when "level" is missing or is not a string.
        level, ok := title.Metadata.Additional["level"].(string)
        if !ok {
            level = "unknown"
        }
        fmt.Printf("[%s] %s\n", level, title.Text)
    }
}
Element-Based Output (Ruby)
require 'kreuzberg'

# Configure element-based output
config = Kreuzberg::Config::Extraction.new(output_format: 'element_based')

# Extract document
result = Kreuzberg.extract_file_sync('document.pdf', config: config)

# Access elements
result.elements.each do |element|
  puts "Type: #{element.element_type}"
  # Show only the first 100 characters of the element text
  puts "Text: #{element.text[0...100]}"

  # page_number is optional, so print it only when it is set
  puts "Page: #{element.metadata.page_number}" if element.metadata.page_number

  # coordinates are optional as well; print the bounding box only when present
  if element.metadata.coordinates
    coords = element.metadata.coordinates
    puts "Coords: (#{coords.left}, #{coords.top}) - (#{coords.right}, #{coords.bottom})"
  end

  puts "---"
end

# Filter by element type
titles = result.elements.select { |e| e.element_type == 'title' }
titles.each do |title|
  # Element-type-specific fields live in metadata.additional; fall back to
  # 'unknown' when a title has no 'level' entry
  level = title.metadata.additional['level'] || 'unknown'
  puts "[#{level}] #{title.text}"
end
Element-Based Output (R)
library(kreuzberg)

# Configure element-based output; output_format is set to "markdown" alongside it
config <- extraction_config(
  result_format = "element_based",
  output_format = "markdown"
)

# Extract document
file_path <- "document.pdf"
result <- extract_file_sync(file_path, config = config)

# Report how many elements were returned
cat(sprintf("Total elements: %d\n\n", length(result$elements)))

# Print each element's position, type, and the first 100 characters of its content
for (i in seq_along(result$elements)) {
  element <- result$elements[[i]]
  cat(sprintf("Element %d:\n", i))
  cat(sprintf("  Type: %s\n", element$element_type))
  cat(sprintf("  Content: %s\n\n", substr(element$content, 1, 100)))
}
Element-Based Output (PHP)
<?php
use Kreuzberg\ExtractionConfig;
use Kreuzberg\Kreuzberg;

// Configure element-based output
$config = new ExtractionConfig();
$config->setOutputFormat('element_based');

// Extract document
$result = Kreuzberg::extractFileSync('document.pdf', $config);

// Access elements
foreach ($result->getElements() as $element) {
    echo "Type: " . $element->getElementType() . "\n";
    // Show only the first 100 bytes of the element text (substr is byte-based)
    echo "Text: " . substr($element->getText(), 0, 100) . "\n";

    // The page number is optional, so print it only when it is set
    if ($element->getMetadata()->getPageNumber()) {
        echo "Page: " . $element->getMetadata()->getPageNumber() . "\n";
    }

    // Coordinates are optional as well; print the bounding box only when present
    if ($element->getMetadata()->getCoordinates()) {
        $coords = $element->getMetadata()->getCoordinates();
        echo sprintf("Coords: (%s, %s) - (%s, %s)\n",
            $coords->getLeft(), $coords->getTop(),
            $coords->getRight(), $coords->getBottom());
    }

    echo "---\n";
}

// Filter by element type (array_filter preserves the original array keys)
$titles = array_filter($result->getElements(), function($e) {
    return $e->getElementType() === 'title';
});

foreach ($titles as $title) {
    // Element-type-specific fields live in the "additional" array; fall back
    // to 'unknown' when a title has no 'level' entry
    $level = $title->getMetadata()->getAdditional()['level'] ?? 'unknown';
    echo "[{$level}] {$title->getText()}\n";
}
?>

Elements are in result.elements. Each element has element_id, element_type, text, and metadata.

Element Types

element_type Description Key additional fields
title Main title or top-level heading level (h1–h6), font_size, font_name
heading Section/subsection heading level (h1–h6)
narrative_text Body paragraph
list_item Bullet, numbered, or indented item list_type, list_marker, indent_level
table Tabular data row_count, column_count, format
image Embedded image format, width, height, alt_text
code_block Code snippet language, line_count
block_quote Quoted text
header Recurring page header position
footer Recurring page footer position
page_break Page boundary marker next_page

Metadata

Every element's metadata contains:

Field Type Description
page_number int | None 1-indexed page number (PDF, DOCX, PPTX)
filename str | None Source filename
coordinates BoundingBox | None x0, y0, x1, y1 in PDF points. Only populated for text elements when pdf_options.hierarchy is enabled with include_bbox=True. Table and image elements do not carry coordinates.
element_index int Zero-based position in the elements array
additional dict[str, str] Element-type-specific fields (see table above)

PDF coordinates use bottom-left origin in points (1/72 inch).

Example Output

{
  "element_id": "elem-a3f2b1c4",
  "element_type": "title",
  "text": "Introduction to Machine Learning",
  "metadata": {
    "page_number": 1,
    "element_index": 0,
    "coordinates": { "x0": 72.0, "y0": 700.0, "x1": 540.0, "y1": 730.0 },
    "additional": { "level": "h1", "font_size": "24" }
  }
}

Filtering Elements

# Request element-based output and extract the document
config = ExtractionConfig(result_format="element_based")
result = extract_file_sync("document.pdf", config=config)

# Split the flat element list by element_type
titles = [e for e in result.elements if e.element_type == "title"]
tables = [e for e in result.elements if e.element_type == "table"]

for title in titles:
    # Treat a title without an explicit "level" entry as "h1"
    level = title.metadata.additional.get("level", "h1")
    print(f"[{level}] {title.text}")

Unstructured.io Compatibility

Element-based output follows Unstructured.io's element array structure. Key differences when migrating:

Aspect Unstructured.io Kreuzberg
Type names PascalCase (Title, NarrativeText) snake_case (title, narrative_text)
Element IDs Not always present Always present (deterministic hash)
Metadata Basic (page_number, filename) Extended (coordinates, additional fields)
Config key — (not applicable) result_format="element_based"