# Element-Based Output (v4.1.0)
Segments a document into a flat array of typed elements — titles, paragraphs, tables, list items, code blocks, images, and more. Each element carries a page number and, for text elements in PDFs when hierarchy extraction is enabled, bounding box coordinates.
Use element-based output for RAG chunking, semantic search, or Unstructured.io-compatible pipelines. For hierarchical tree structure, use document structure. For plain text, use the default unified output.
## Enable
from kreuzberg import extract_file_sync, ExtractionConfig
# Configure element-based output: result_format selects the flat element array.
config = ExtractionConfig(result_format="element_based")

# Extract document
result = extract_file_sync("document.pdf", config=config)

# Access elements: print the type, a 100-character text preview, and any
# page/coordinate metadata that is present on the element.
for element in result.elements:
    print(f"Type: {element.element_type}")
    print(f"Text: {element.text[:100]}")
    if element.metadata.page_number:
        print(f"Page: {element.metadata.page_number}")
    if element.metadata.coordinates:
        coords = element.metadata.coordinates
        print(f"Coords: ({coords.left}, {coords.top}) - ({coords.right}, {coords.bottom})")
    print("---")

# Filter by element type
titles = [e for e in result.elements if e.element_type == "title"]
for title in titles:
    # "level" is an element-type-specific entry in metadata.additional;
    # fall back to "unknown" when it is absent.
    level = title.metadata.additional.get("level", "unknown")
    print(f"[{level}] {title.text}")
import { extractFileSync, ExtractionConfig } from '@kreuzberg/node';
// Configure element-based output
// NOTE(review): other examples on this page select element-based output via
// result_format; confirm outputFormat is the intended key for this binding.
const config: ExtractionConfig = {
  outputFormat: "element_based",
};

// Extract document
const result = extractFileSync("document.pdf", null, config);

// Access elements: type, a 100-character preview, then optional metadata.
for (const element of result.elements) {
  console.log(`Type: ${element.elementType}`);
  console.log(`Text: ${element.text.slice(0, 100)}`);

  const { pageNumber, coordinates } = element.metadata;
  if (pageNumber) {
    console.log(`Page: ${pageNumber}`);
  }
  if (coordinates) {
    console.log(`Coords: (${coordinates.left}, ${coordinates.top}) - (${coordinates.right}, ${coordinates.bottom})`);
  }

  console.log("---");
}

// Filter by element type
const titles = result.elements.filter((e) => e.elementType === "title");
titles.forEach((title) => {
  // Fall back to "unknown" when no level is recorded for the title.
  const level = title.metadata.additional?.level || "unknown";
  console.log(`[${level}] ${title.text}`);
});
use kreuzberg::{extract_file_sync, ExtractionConfig};
use kreuzberg::types::OutputFormat as ResultFormat;
/// Extracts `document.pdf` with element-based output, prints each element's
/// type, a short text preview and any page/coordinate metadata, then lists
/// the title elements together with their `level`.
///
/// # Errors
///
/// Propagates the error returned by `extract_file_sync`.
fn main() -> kreuzberg::Result<()> {
    // Configure element-based output (result_format controls Unified vs ElementBased)
    let config = ExtractionConfig {
        result_format: ResultFormat::ElementBased,
        ..Default::default()
    };

    // Extract document
    let result = extract_file_sync("document.pdf", None, &config)?;

    // Access elements
    if let Some(elements) = result.elements {
        for element in &elements {
            println!("Type: {:?}", element.element_type);

            // Cut the preview after 100 characters, not 100 bytes: slicing a
            // `str` at a byte offset that falls inside a multi-byte UTF-8
            // sequence panics.
            let preview_end = element
                .text
                .char_indices()
                .nth(100)
                .map_or(element.text.len(), |(idx, _)| idx);
            println!("Text: {}", &element.text[..preview_end]);

            if let Some(page) = element.metadata.page_number {
                println!("Page: {}", page);
            }
            if let Some(coords) = &element.metadata.coordinates {
                println!(
                    "Coords: ({}, {}) - ({}, {})",
                    coords.x0, coords.y0, coords.x1, coords.y1
                );
            }
            println!("---");
        }

        // Filter by element type
        let titles: Vec<_> = elements
            .iter()
            .filter(|e| matches!(e.element_type, kreuzberg::types::ElementType::Title))
            .collect();
        for title in titles {
            // Fall back to "unknown" when the title has no `level` entry.
            let level = title
                .metadata
                .additional
                .get("level")
                .map(|v| v.as_ref())
                .unwrap_or("unknown");
            println!("[{}] {}", level, title.text);
        }
    }

    Ok(())
}
package main
import (
"fmt"
"kreuzberg"
)
// main extracts document.pdf with element-based output, prints each
// element's type, a text preview and any page/coordinate metadata, then
// lists the title elements together with their level.
func main() {
	// Configure element-based output
	// NOTE(review): other examples on this page select element-based output
	// via result_format; confirm OutputFormat is the intended field here.
	config := &kreuzberg.ExtractionConfig{
		OutputFormat: "element_based",
	}

	// Extract document
	result, err := kreuzberg.ExtractFileSync("document.pdf", config)
	if err != nil {
		panic(err)
	}

	// Access elements
	for _, element := range result.Elements {
		fmt.Printf("Type: %s\n", element.ElementType)

		// Truncate the preview to 100 characters on rune boundaries; slicing
		// the string by bytes could cut a multi-byte UTF-8 character in half.
		text := element.Text
		if runes := []rune(text); len(runes) > 100 {
			text = string(runes[:100])
		}
		fmt.Printf("Text: %s\n", text)

		if element.Metadata.PageNumber != nil {
			fmt.Printf("Page: %d\n", *element.Metadata.PageNumber)
		}
		if element.Metadata.Coordinates != nil {
			coords := element.Metadata.Coordinates
			fmt.Printf("Coords: (%f, %f) - (%f, %f)\n",
				coords.Left, coords.Top, coords.Right, coords.Bottom)
		}
		fmt.Println("---")
	}

	// Filter by element type
	var titles []kreuzberg.Element
	for _, element := range result.Elements {
		if element.ElementType == "title" {
			titles = append(titles, element)
		}
	}
	for _, title := range titles {
		// Fall back to "unknown" when "level" is missing or not a string.
		level, ok := title.Metadata.Additional["level"].(string)
		if !ok {
			level = "unknown"
		}
		fmt.Printf("[%s] %s\n", level, title.Text)
	}
}
require 'kreuzberg'
# Configure element-based output
# NOTE(review): other examples on this page select element-based output via
# result_format; confirm output_format is the intended key for this binding.
config = Kreuzberg::Config::Extraction.new(output_format: 'element_based')

# Extract document
result = Kreuzberg.extract_file_sync('document.pdf', config: config)

# Access elements: type, a 100-character preview, then optional metadata.
result.elements.each do |element|
  puts "Type: #{element.element_type}"
  puts "Text: #{element.text[0...100]}"

  meta = element.metadata
  if meta.page_number
    puts "Page: #{meta.page_number}"
  end
  if (coords = meta.coordinates)
    puts "Coords: (#{coords.left}, #{coords.top}) - (#{coords.right}, #{coords.bottom})"
  end

  puts "---"
end

# Filter by element type
titles = result.elements.select { |e| e.element_type == 'title' }
titles.each do |title|
  # Fall back to 'unknown' when no level is recorded for the title.
  level = title.metadata.additional['level'] || 'unknown'
  puts "[#{level}] #{title.text}"
end
library(kreuzberg)
# result_format selects element-based output; output_format is set to "markdown".
config <- extraction_config(
  result_format = "element_based",
  output_format = "markdown"
)

file_path <- "document.pdf"
result <- extract_file_sync(file_path, config = config)

# Report how many elements were produced, then print each element's type and
# the first 100 characters of its content.
elements <- result$elements
cat(sprintf("Total elements: %d\n\n", length(elements)))

for (idx in seq_along(elements)) {
  item <- elements[[idx]]
  cat(sprintf("Element %d:\n", idx))
  cat(sprintf(" Type: %s\n", item$element_type))
  cat(sprintf(" Content: %s\n\n", substr(item$content, 1, 100)))
}
<?php
use Kreuzberg\ExtractionConfig;
use Kreuzberg\Kreuzberg;
// Configure element-based output
// NOTE(review): other examples on this page select element-based output via
// result_format; confirm setOutputFormat is the intended setter here.
$config = new ExtractionConfig();
$config->setOutputFormat('element_based');

// Extract document
$result = Kreuzberg::extractFileSync('document.pdf', $config);

// Access elements: type, a 100-byte preview, then optional metadata.
foreach ($result->getElements() as $element) {
    echo "Type: " . $element->getElementType() . "\n";
    echo "Text: " . substr($element->getText(), 0, 100) . "\n";

    $metadata = $element->getMetadata();

    $page = $metadata->getPageNumber();
    if ($page) {
        echo "Page: " . $page . "\n";
    }

    $coords = $metadata->getCoordinates();
    if ($coords) {
        printf(
            "Coords: (%s, %s) - (%s, %s)\n",
            $coords->getLeft(),
            $coords->getTop(),
            $coords->getRight(),
            $coords->getBottom()
        );
    }

    echo "---\n";
}

// Filter by element type (array_filter preserves the original keys)
$titles = array_filter($result->getElements(), function ($e) {
    return $e->getElementType() === 'title';
});
foreach ($titles as $title) {
    // Fall back to 'unknown' when no level is recorded for the title.
    $level = $title->getMetadata()->getAdditional()['level'] ?? 'unknown';
    echo "[{$level}] {$title->getText()}\n";
}
?>
Elements are in result.elements. Each element has element_id, element_type, text, and metadata.
## Element Types
| `element_type` | Description | Key `additional` fields |
|---|---|---|
| `title` | Main title or top-level heading | `level` (h1–h6), `font_size`, `font_name` |
| `heading` | Section/subsection heading | `level` (h1–h6) |
| `narrative_text` | Body paragraph | — |
| `list_item` | Bullet, numbered, or indented item | `list_type`, `list_marker`, `indent_level` |
| `table` | Tabular data | `row_count`, `column_count`, `format` |
| `image` | Embedded image | `format`, `width`, `height`, `alt_text` |
| `code_block` | Code snippet | `language`, `line_count` |
| `block_quote` | Quoted text | — |
| `header` | Recurring page header | `position` |
| `footer` | Recurring page footer | `position` |
| `page_break` | Page boundary marker | `next_page` |
## Metadata
Every element's metadata contains:
| Field | Type | Description |
|---|---|---|
| `page_number` | `int \| None` | 1-indexed page number (PDF, DOCX, PPTX) |
| `filename` | `str \| None` | Source filename |
| `coordinates` | `BoundingBox \| None` | `x0`, `y0`, `x1`, `y1` in PDF points. Only populated for text elements when `pdf_options.hierarchy` is enabled with `include_bbox=True`. Table and image elements do not carry coordinates. |
| `element_index` | `int` | Zero-based position in the elements array |
| `additional` | `dict[str, str]` | Element-type-specific fields (see table above) |
PDF coordinates use bottom-left origin in points (1/72 inch).
## Example Output
{
"element_id": "elem-a3f2b1c4",
"element_type": "title",
"text": "Introduction to Machine Learning",
"metadata": {
"page_number": 1,
"element_index": 0,
"coordinates": { "x0": 72.0, "y0": 700.0, "x1": 540.0, "y1": 730.0 },
"additional": { "level": "h1", "font_size": "24" }
}
}
## Filtering Elements
config = ExtractionConfig(result_format="element_based")
result = extract_file_sync("document.pdf", config=config)

# Split the flat element list by element type.
titles = [e for e in result.elements if e.element_type == "title"]
tables = [e for e in result.elements if e.element_type == "table"]

for title in titles:
    # Titles without a "level" entry in metadata.additional are shown as "h1".
    level = title.metadata.additional.get("level", "h1")
    print(f"[{level}] {title.text}")
## Unstructured.io Compatibility
Element-based output follows Unstructured.io's element array structure. Key differences when migrating:
| Aspect | Unstructured.io | Kreuzberg |
|---|---|---|
| Type names | PascalCase (`Title`, `NarrativeText`) | snake_case (`title`, `narrative_text`) |
| Element IDs | Not always present | Always present (deterministic hash) |
| Metadata | Basic (`page_number`, `filename`) | Extended (`coordinates`, `additional` fields) |
| Config key | — | `result_format="element_based"` |