Skip to content

Extraction Basics

flowchart LR
    Input[Input] --> FileOrBytes{File or Bytes?}
    FileOrBytes -->|File Path| FileFunctions[File Functions]
    FileOrBytes -->|In-Memory Data| BytesFunctions[Bytes Functions]

    FileFunctions --> SingleOrBatch{Single or Batch?}
    BytesFunctions --> SingleOrBatch

    SingleOrBatch -->|Single| SingleMode[Single Extraction]
    SingleOrBatch -->|Multiple| BatchMode[Batch Processing]

    SingleMode --> SyncAsync{Sync or Async?}
    BatchMode --> SyncAsync

    SyncAsync -->|Sync| SyncFuncs[extract_file_sync<br/>extract_bytes_sync<br/>batch_extract_files_sync<br/>batch_extract_bytes_sync]
    SyncAsync -->|Async| AsyncFuncs[extract_file<br/>extract_bytes<br/>batch_extract_files<br/>batch_extract_bytes]

    style FileFunctions fill:#87CEEB
    style BytesFunctions fill:#FFD700
    style SyncFuncs fill:#90EE90
    style AsyncFuncs fill:#FFB6C1

Kreuzberg provides 8 core extraction functions organized into 4 categories: file extraction, bytes extraction, batch file extraction, and batch bytes extraction. Each category has both a synchronous and an asynchronous variant.

Extract from Files

Extract text, tables, and metadata from a file on disk.

Synchronous

Python
from kreuzberg import extract_file_sync, ExtractionConfig

config: ExtractionConfig = ExtractionConfig()
result = extract_file_sync("document.pdf", config=config)

content: str = result.content
table_count: int = len(result.tables)
metadata: dict = result.metadata

print(f"Content length: {len(content)} characters")
print(f"Tables: {table_count}")
print(f"Metadata keys: {list(metadata.keys())}")
TypeScript
import { extractFileSync } from '@kreuzberg/node';

const result = extractFileSync('document.pdf');

console.log(result.content);
console.log(`Tables: ${result.tables.length}`);
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
Rust
use kreuzberg::{extract_file_sync, ExtractionConfig};

fn main() -> kreuzberg::Result<()> {
    let result = extract_file_sync("document.pdf", None, &ExtractionConfig::default())?;

    println!("{}", result.content);
    println!("Tables: {}", result.tables.len());
    println!("Metadata: {:?}", result.metadata);
    Ok(())
}
Ruby
require 'kreuzberg'

result = Kreuzberg.extract_file_sync('document.pdf')

puts result.content
puts "Tables: #{result.tables.length}"
puts "Metadata: #{result.metadata}"
Java
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;

try {
    ExtractionResult result = Kreuzberg.extractFile("document.pdf");

    System.out.println(result.getContent());
    System.out.println("Tables: " + result.getTables().size());
    System.out.println("Metadata: " + result.getMetadata());
} catch (IOException | KreuzbergException e) {
    e.printStackTrace();
}
Go
package main

import (
    "fmt"
    "log"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

func main() {
    result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }

    fmt.Println(result.Content)
    fmt.Printf("Tables: %d\n", len(result.Tables))
    fmt.Printf("Metadata: %+v\n", result.Metadata)
}
C#
using Kreuzberg;

var result = KreuzbergClient.ExtractFileSync("document.pdf", new ExtractionConfig());

Console.WriteLine(result.Content);
Console.WriteLine($"Tables: {result.Tables.Count}");
Console.WriteLine($"Metadata: {result.Metadata.FormatType}");
WASM
import { initWasm, extractFile } from '@kreuzberg/wasm';

// Initialize WASM module once at app startup
await initWasm();

// Extract from file path (Node.js/Deno/Bun only)
const result = await extractFile('document.pdf');

console.log(result.content);
console.log(`Tables: ${result.tables.length}`);
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);

Asynchronous

Python
import asyncio
from kreuzberg import extract_file, ExtractionConfig

async def main() -> None:
    config: ExtractionConfig = ExtractionConfig()
    result = await extract_file("document.pdf", config=config)

    content: str = result.content
    table_count: int = len(result.tables)

    print(f"Content length: {len(content)} characters")
    print(f"Tables: {table_count}")

asyncio.run(main())
TypeScript
import { extractFile } from '@kreuzberg/node';

const result = await extractFile('document.pdf');
console.log(result.content);
Rust
use kreuzberg::{extract_file, ExtractionConfig};

#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let result = extract_file("document.pdf", None, &ExtractionConfig::default()).await?;
    println!("{}", result.content);
    Ok(())
}
Ruby
require 'kreuzberg'

# Ruby uses blocking APIs; async variants call into Tokio internally.
result = Kreuzberg.extract_file('document.pdf')
puts result.content
Java
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import java.nio.file.Path;
import java.util.concurrent.CompletableFuture;

/**
 * Minimal example of asynchronous file extraction.
 */
public class Example {
    /**
     * Starts an asynchronous extraction of {@code document.pdf}, then prints
     * the extracted content and the number of tables once it completes.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // The second argument is null here; presumably this selects the
        // default configuration (confirm against the Java API reference).
        CompletableFuture<ExtractionResult> future =
            Kreuzberg.extractFileAsync(Path.of("document.pdf"), null);

        // join() blocks until the extraction and the printing callback below
        // have both completed.
        future.thenAccept(result -> {
            System.out.println(result.getContent());
            System.out.println("Tables: " + result.getTables().size());
        }).join();
    }
}
Go
package main

import (
    "context"
    "fmt"
    "log"
    "time"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

func main() {
    ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
    defer cancel()

    result, err := kreuzberg.ExtractFile(ctx, "document.pdf", nil)
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }

    fmt.Println(result.Content)
}
C#
using Kreuzberg;

var result = await KreuzbergClient.ExtractFileAsync("document.pdf");

Console.WriteLine(result.Content);
Console.WriteLine(result.MimeType);
WASM
import { initWasm, extractFile } from '@kreuzberg/wasm';

await initWasm();

// Extract from file path (async)
const result = await extractFile('document.pdf');

console.log(result.content);
console.log(`Tables: ${result.tables.length}`);
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
flowchart TD
    Start[Choose Sync or Async] --> Context{Already in<br/>Async Context?}

    Context -->|Yes| UseAsync[Use Async Variants]
    Context -->|No| CheckUse{Use Case}

    CheckUse -->|Simple Script| UseSyncSimple[Use Sync<br/>Simpler, same speed]
    CheckUse -->|Multiple Files| CheckConcurrency{Concurrent<br/>Processing?}
    CheckUse -->|Single File| UseSyncSingle[Use Sync<br/>Easier to read]

    CheckConcurrency -->|Yes| UseAsyncConcurrent[Use Async<br/>Better concurrency]
    CheckConcurrency -->|No| UseSyncBatch[Use Sync<br/>Adequate]

    UseAsync --> AsyncFunctions[extract_file<br/>extract_bytes<br/>batch_extract_files<br/>batch_extract_bytes]
    UseSyncSimple --> SyncFunctions[extract_file_sync<br/>extract_bytes_sync<br/>batch_extract_files_sync<br/>batch_extract_bytes_sync]
    UseSyncSingle --> SyncFunctions
    UseAsyncConcurrent --> AsyncFunctions
    UseSyncBatch --> SyncFunctions

    style UseAsync fill:#FFB6C1
    style UseAsyncConcurrent fill:#FFB6C1
    style UseSyncSimple fill:#90EE90
    style UseSyncSingle fill:#90EE90
    style UseSyncBatch fill:#90EE90

TypeScript / Node.js

All TypeScript/Node.js examples in this guide use the @kreuzberg/node package. Both the synchronous APIs (for example, extractFileSync) and their asynchronous counterparts (for example, extractFile) are exported from the package's root module. See the TypeScript API Reference for complete type definitions.

basic_extraction.ts
import { extractFileSync, ExtractionConfig } from '@kreuzberg/node';

// Extract a document using synchronous API
const result = extractFileSync('document.pdf', null, new ExtractionConfig());
console.log(result.content);

Ruby

Ruby bindings mirror the same function names (extract_file_sync, extract_bytes, batch_extract_files, etc.) under the Kreuzberg module. Configuration objects live under Kreuzberg::Config. See the Ruby API Reference for details.

basic_extraction.rb
require 'kreuzberg'

# Extract a document with OCR enabled
config = Kreuzberg::Config::Extraction.new(force_ocr: true)
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
puts result.content

When to Use Async

Use async variants when you're already in an async context or processing multiple files concurrently. For simple scripts, sync variants are simpler and just as fast.

Extract from Bytes

Extract from data already loaded in memory.

Synchronous

Python
from kreuzberg import extract_bytes_sync, ExtractionConfig

with open("document.pdf", "rb") as f:
    data = f.read()

result = extract_bytes_sync(
    data,
    mime_type="application/pdf",
    config=ExtractionConfig()
)
print(result.content)
TypeScript
import { extractBytesSync } from '@kreuzberg/node';
import { readFileSync } from 'fs';

const data = readFileSync('document.pdf');
const result = extractBytesSync(data, 'application/pdf');
console.log(result.content);
Rust
use kreuzberg::{extract_bytes_sync, ExtractionConfig};
use std::fs;

fn main() -> kreuzberg::Result<()> {
    let data = fs::read("document.pdf")?;

    let result = extract_bytes_sync(
        &data,
        "application/pdf",
        &ExtractionConfig::default()
    )?;
    println!("{}", result.content);
    Ok(())
}
Ruby
require 'kreuzberg'

data = File.binread('document.pdf')

result = Kreuzberg.extract_bytes_sync(
    data,
    'application/pdf'
)
puts result.content
Java
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;

try {
    byte[] data = Files.readAllBytes(Paths.get("document.pdf"));

    ExtractionResult result = Kreuzberg.extractBytes(
        data,
        "application/pdf",
        null
    );
    System.out.println(result.getContent());
} catch (IOException | KreuzbergException e) {
    e.printStackTrace();
}
Go
package main

import (
    "log"
    "os"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

func main() {
    data, err := os.ReadFile("document.pdf")
    if err != nil {
        log.Fatalf("read file: %v", err)
    }

    result, err := kreuzberg.ExtractBytesSync(data, "application/pdf", nil)
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }

    log.Println(result.Content)
}
C#
using Kreuzberg;

var data = await File.ReadAllBytesAsync("document.pdf");
var result = KreuzbergClient.ExtractBytesSync(data, "application/pdf");

Console.WriteLine(result.Content);
Console.WriteLine(result.MimeType);
WASM
import { initWasm, extractBytes } from '@kreuzberg/wasm';

await initWasm();

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, 'application/pdf');

console.log(result.content);
console.log(`Tables: ${result.tables.length}`);
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);

Asynchronous

Python
import asyncio
from kreuzberg import extract_bytes, ExtractionConfig

async def main() -> None:
    """Read ``document.pdf`` into memory and extract it asynchronously.

    The bytes are passed to ``extract_bytes`` together with an explicit MIME
    type, and the extracted text is printed.
    """
    from pathlib import Path

    # Read the file in a worker thread: a plain ``open(...).read()`` inside a
    # coroutine would block the event loop while the file is loaded.
    data = await asyncio.to_thread(Path("document.pdf").read_bytes)

    result = await extract_bytes(
        data,
        mime_type="application/pdf",
        config=ExtractionConfig()
    )
    print(result.content)

asyncio.run(main())
TypeScript
import { extractBytes } from '@kreuzberg/node';
import { readFile } from 'fs/promises';

const data = await readFile('document.pdf');
const result = await extractBytes(data, 'application/pdf');
console.log(result.content);
Rust
use kreuzberg::{extract_bytes, ExtractionConfig};
use tokio::fs;

#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let data = fs::read("document.pdf").await?;

    let result = extract_bytes(
        &data,
        "application/pdf",
        &ExtractionConfig::default()
    ).await?;
    println!("{}", result.content);
    Ok(())
}
Ruby
require 'kreuzberg'

data = File.binread('document.pdf')

result = Kreuzberg.extract_bytes(
  data,
  'application/pdf'
)
puts result.content
Java
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.concurrent.CompletableFuture;

try {
    byte[] data = Files.readAllBytes(Paths.get("document.pdf"));

    CompletableFuture<ExtractionResult> future = Kreuzberg.extractBytesAsync(
        data,
        "application/pdf",
        null
    );

    future.thenAccept(result -> System.out.println(result.getContent()))
        .join();
} catch (IOException e) {
    e.printStackTrace();
}
Go
package main

import (
    "context"
    "log"
    "os"
    "time"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

func main() {
    data, err := os.ReadFile("document.pdf")
    if err != nil {
        log.Fatalf("read file: %v", err)
    }

    ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
    defer cancel()

    result, err := kreuzberg.ExtractBytes(ctx, data, "application/pdf", nil)
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }
    log.Println(result.Content)
}
C#
using Kreuzberg;

var data = await File.ReadAllBytesAsync("document.pdf");
var result = await KreuzbergClient.ExtractBytesAsync(data, "application/pdf");

Console.WriteLine(result.Content);
Console.WriteLine(result.MimeType);
WASM
import { initWasm, extractBytes } from '@kreuzberg/wasm';

await initWasm();

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, 'application/pdf');

console.log(result.content);
console.log(`Tables: ${result.tables.length}`);
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);

MIME Type Detection

When extracting from a file path, Kreuzberg automatically detects the MIME type from the file extension. In-memory bytes carry no file name, so when extracting from bytes you must provide the MIME type explicitly.

Batch Processing

Process multiple files concurrently for better performance.

Batch Extract Files

Python
from kreuzberg import batch_extract_files_sync, ExtractionConfig

files: list[str] = ["doc1.pdf", "doc2.docx", "doc3.pptx"]
config: ExtractionConfig = ExtractionConfig()

results = batch_extract_files_sync(files, config=config)

for i, result in enumerate(results):
    char_count: int = len(result.content)
    print(f"File {i + 1}: {char_count} characters")
TypeScript
import { batchExtractFilesSync } from '@kreuzberg/node';

const files = ['doc1.pdf', 'doc2.docx', 'doc3.pptx'];
const results = batchExtractFilesSync(files);

results.forEach((result, i) => {
    console.log(`File ${i + 1}: ${result.content.length} characters`);
});
Rust
use kreuzberg::{batch_extract_file_sync, ExtractionConfig};

fn main() -> kreuzberg::Result<()> {
    let files = vec!["doc1.pdf", "doc2.docx", "doc3.pptx"];
    let config = ExtractionConfig::default();

    let results = batch_extract_file_sync(&files, None, &config)?;

    for (i, result) in results.iter().enumerate() {
        println!("File {}: {} characters", i + 1, result.content.len());
    }
    Ok(())
}
Ruby
require 'kreuzberg'

files = ['doc1.pdf', 'doc2.docx', 'doc3.pptx']

results = Kreuzberg.batch_extract_files_sync(files)

results.each_with_index do |result, i|
  puts "File #{i + 1}: #{result.content.length} characters"
end
Java
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;

try {
    List<String> files = Arrays.asList("doc1.pdf", "doc2.docx", "doc3.pptx");

    List<ExtractionResult> results = Kreuzberg.batchExtractFiles(files, null);

    for (int i = 0; i < results.size(); i++) {
        ExtractionResult result = results.get(i);
        System.out.println("File " + (i + 1) + ": " + result.getContent().length() + " characters");
    }
} catch (IOException | KreuzbergException e) {
    e.printStackTrace();
}
Go
package main

import (
    "fmt"
    "log"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

func main() {
    files := []string{"doc1.pdf", "doc2.docx", "doc3.pptx"}

    results, err := kreuzberg.BatchExtractFilesSync(files, nil)
    if err != nil {
        log.Fatalf("batch extract failed: %v", err)
    }

    for i, result := range results {
        if result == nil {
            continue
        }
        fmt.Printf("File %d: %d characters\n", i+1, len(result.Content))
    }
}
C#
using Kreuzberg;

var files = new[] { "doc1.pdf", "doc2.docx", "doc3.pptx" };
var results = KreuzbergClient.BatchExtractFilesSync(files, new ExtractionConfig());

foreach (var result in results)
{
    Console.WriteLine($"Content length: {result.Content.Length}");
}
WASM
import { initWasm, batchExtractFiles } from '@kreuzberg/wasm';

await initWasm();

const files = [
  new File(['content1'], 'doc1.pdf', { type: 'application/pdf' }),
  new File(['content2'], 'doc2.pdf', { type: 'application/pdf' })
];

const results = await batchExtractFiles(files);

results.forEach((result, index) => {
  console.log(`Document ${index + 1}: ${result.content.substring(0, 100)}...`);
});

Batch Extract Bytes

Python
from kreuzberg import batch_extract_bytes_sync, ExtractionConfig

files: list[str] = ["doc1.pdf", "doc2.docx"]
data_list: list[bytes] = []
mime_types: list[str] = []

for file in files:
    with open(file, "rb") as f:
        data_list.append(f.read())
    mime_type: str = "application/pdf" if file.endswith(".pdf") else "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    mime_types.append(mime_type)

config: ExtractionConfig = ExtractionConfig()
results = batch_extract_bytes_sync(data_list, mime_types, config=config)

for i, result in enumerate(results):
    char_count: int = len(result.content)
    print(f"Document {i + 1}: {char_count} characters")
TypeScript
import { batchExtractBytesSync } from '@kreuzberg/node';
import { readFileSync } from 'fs';

const files = ['doc1.pdf', 'doc2.docx'];
const dataList = files.map(f => readFileSync(f));
const mimeTypes = [
    'application/pdf',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
];

const results = batchExtractBytesSync(dataList, mimeTypes);

results.forEach((result, i) => {
    console.log(`Document ${i + 1}: ${result.content.length} characters`);
});
Rust
use kreuzberg::{batch_extract_bytes_sync, ExtractionConfig};
use std::fs;

/// Reads each input file into memory, extracts all of them in one batch call,
/// and prints the content length of every result.
fn main() -> kreuzberg::Result<()> {
    let files = vec!["doc1.pdf", "doc2.docx"];

    // Propagate read failures with `?` instead of panicking via `expect`.
    // Collecting into `io::Result` stops at the first file that cannot be read.
    let data_list: Vec<Vec<u8>> = files
        .iter()
        .map(|path| fs::read(path))
        .collect::<std::io::Result<_>>()?;

    // Pick the MIME type from the file extension; anything that is not a PDF
    // is treated as DOCX in this example.
    let mime_types: Vec<&str> = files
        .iter()
        .map(|path| {
            if path.ends_with(".pdf") {
                "application/pdf"
            } else {
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            }
        })
        .collect();

    let results = batch_extract_bytes_sync(
        &data_list,
        &mime_types,
        &ExtractionConfig::default()
    )?;

    for (i, result) in results.iter().enumerate() {
        println!("Document {}: {} characters", i + 1, result.content.len());
    }
    Ok(())
}
Ruby
require 'kreuzberg'

files = ['doc1.pdf', 'doc2.docx']

data_list = files.map { |f| File.binread(f) }
mime_types = files.map do |f|
  f.end_with?('.pdf') ? 'application/pdf' :
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
end

results = Kreuzberg.batch_extract_bytes_sync(
  data_list,
  mime_types
)

results.each_with_index do |result, i|
  puts "Document #{i + 1}: #{result.content.length} characters"
end
Java
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.BytesWithMime;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

try {
    List<String> files = Arrays.asList("doc1.pdf", "doc2.docx");

    List<BytesWithMime> dataList = new ArrayList<>();
    for (String file : files) {
        byte[] data = Files.readAllBytes(Paths.get(file));
        String mimeType = file.endsWith(".pdf") ? "application/pdf" :
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
        dataList.add(new BytesWithMime(data, mimeType));
    }

    List<ExtractionResult> results = Kreuzberg.batchExtractBytes(dataList, null);

    for (int i = 0; i < results.size(); i++) {
        ExtractionResult result = results.get(i);
        System.out.println("Document " + (i + 1) + ": " + result.getContent().length() + " characters");
    }
} catch (IOException | KreuzbergException e) {
    e.printStackTrace();
}
Go
package main

import (
    "fmt"
    "log"
    "os"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

func main() {
    files := []struct {
        Path string
        MIME string
    }{
        {"doc1.pdf", "application/pdf"},
        {"doc2.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"},
    }

    items := make([]kreuzberg.BytesWithMime, 0, len(files))
    for _, file := range files {
        data, err := os.ReadFile(file.Path)
        if err != nil {
            log.Fatalf("read %s: %v", file.Path, err)
        }
        items = append(items, kreuzberg.BytesWithMime{
            Data:     data,
            MimeType: file.MIME,
        })
    }

    results, err := kreuzberg.BatchExtractBytesSync(items, nil)
    if err != nil {
        log.Fatalf("batch extract failed: %v", err)
    }

    for i, result := range results {
        if result == nil {
            continue
        }
        fmt.Printf("Document %d: %d characters\n", i+1, len(result.Content))
    }
}
C#
using Kreuzberg;

var documents = new[]
{
    new BytesWithMime(await File.ReadAllBytesAsync("doc1.pdf"), "application/pdf"),
    new BytesWithMime(await File.ReadAllBytesAsync("doc2.docx"), "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
};

var results = KreuzbergClient.BatchExtractBytesSync(documents, new ExtractionConfig());

Console.WriteLine($"Processed {results.Count} documents");
WASM
import { initWasm, batchExtractBytes } from '@kreuzberg/wasm';

await initWasm();

const dataList = [
  new Uint8Array(buffer1),
  new Uint8Array(buffer2)
];

const mimeTypes = [
  'application/pdf',
  'application/pdf'
];

const results = await batchExtractBytes(dataList, mimeTypes);

results.forEach((result, index) => {
  console.log(`Document ${index + 1}: ${result.content.substring(0, 100)}...`);
});
flowchart TD
    Start[Multiple Files to Process] --> Method{Processing Method}

    Method -->|Sequential| Sequential[Process One by One]
    Method -->|Batch| Batch[Batch Processing]

    Sequential --> Seq1[File 1: 1.0s]
    Seq1 --> Seq2[File 2: 1.0s]
    Seq2 --> Seq3[File 3: 1.0s]
    Seq3 --> Seq4[File 4: 1.0s]
    Seq4 --> SeqTotal[Total: 4.0s]

    Batch --> Parallel[Automatic Parallelization]
    Parallel --> Par1[File 1: 1.0s]
    Parallel --> Par2[File 2: 1.0s]
    Parallel --> Par3[File 3: 1.0s]
    Parallel --> Par4[File 4: 1.0s]

    Par1 --> ParTotal[Total: ~1.2s]
    Par2 --> ParTotal
    Par3 --> ParTotal
    Par4 --> ParTotal

    SeqTotal --> Result[Sequential: Slow]
    ParTotal --> ResultFast[Batch: 2-5x Faster]

    style Sequential fill:#FFB6C1
    style Batch fill:#90EE90
    style SeqTotal fill:#FF6B6B
    style ResultFast fill:#4CAF50

Performance

Batch processing provides automatic parallelization. For large sets of files, this can be 2-5x faster than processing files sequentially.

Supported Formats

Kreuzberg supports 56 file formats across 8 categories:

Format Extensions Notes
PDF .pdf Native text + OCR for scanned pages
Images .png, .jpg, .jpeg, .tiff, .bmp, .webp Requires OCR backend
Office .docx, .pptx, .xlsx Modern formats via native parsers
Legacy Office .doc, .ppt Requires LibreOffice
Email .eml, .msg Full support including attachments
Web .html, .htm Converted to Markdown with metadata
Text .md, .txt, .xml, .json, .yaml, .toml, .csv Direct extraction
Archives .zip, .tar, .tar.gz, .tar.bz2 Recursive extraction

See the installation guide for optional dependencies (Tesseract, LibreOffice).

Page Tracking and Boundaries

Kreuzberg can track page boundaries and extract per-page content for supported formats.

When Page Tracking is Available

Page tracking is format-specific:

  • PDF: Full byte-accurate page tracking with O(1) lookup performance
  • PPTX: Slide boundary tracking (each slide is a "page")
  • DOCX: Best-effort page break detection using explicit <w:br type="page"/> tags
  • Other formats: No page tracking (boundaries and pages are None/null)

Enabling Page Extraction

To extract per-page content, enable extract_pages:

graph LR
    A[Document] --> B[Extract with PageConfig]
    B --> C[Combined content]
    B --> D[Pages array]
    D --> E[Page 1]
    D --> F[Page 2]
    D --> G[Page N]

See PageConfig documentation for configuration details.

Page Markers

Optionally insert page markers into the combined content string:

Python
from kreuzberg import ExtractionConfig, PageConfig

# Insert a marker at each page boundary of the combined content string.
# `{page_num}` in `marker_format` is the placeholder for the page number.
config = ExtractionConfig(
    pages=PageConfig(
        insert_page_markers=True,
        marker_format="\n\n<!-- PAGE {page_num} -->\n\n"
    )
)

This adds markers like <!-- PAGE 1 --> at page boundaries in the content field, useful for LLMs to understand document structure.

Relationship with Chunking

When both page tracking and chunking are enabled, chunks automatically include first_page and last_page metadata showing which pages they span.

See Advanced Page Tracking for chunk-to-page mapping examples.

Error Handling

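All extraction functions report failures explicitly: most bindings raise (throw) exceptions, while Rust and Go return error values (`Result` and `error`, respectively):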

Python
from kreuzberg import extract_file_sync, extract_bytes_sync, ExtractionConfig
from kreuzberg import (
    KreuzbergError,
    ParsingError,
    OCRError,
    ValidationError,
)

try:
    result = extract_file_sync("document.pdf")
    print(f"Extracted {len(result.content)} characters")
except ParsingError as e:
    print(f"Failed to parse document: {e}")
except OCRError as e:
    print(f"OCR processing failed: {e}")
except KreuzbergError as e:
    print(f"Extraction error: {e}")

try:
    config: ExtractionConfig = ExtractionConfig()
    pdf_bytes: bytes = b"%PDF-1.4\n"
    result = extract_bytes_sync(pdf_bytes, "application/pdf", config)
    print(f"Extracted: {result.content[:100]}")
except ValidationError as e:
    print(f"Invalid configuration: {e}")
except OCRError as e:
    print(f"OCR failed: {e}")
except KreuzbergError as e:
    print(f"Extraction failed: {e}")
TypeScript
import { extractFileSync, KreuzbergError } from '@kreuzberg/node';

try {
    const result = extractFileSync('document.pdf');
    console.log(result.content);
} catch (error) {
    if (error instanceof KreuzbergError) {
        console.error(`Extraction error: ${error.message}`);
    } else {
        throw error;
    }
}
Rust
use kreuzberg::{extract_file_sync, extract_bytes_sync, ExtractionConfig, KreuzbergError};

/// Demonstrates matching on specific `KreuzbergError` variants, first for
/// file-based extraction and then for in-memory (bytes) extraction.
fn main() -> kreuzberg::Result<()> {
    // File extraction: report the failure category and continue.
    match extract_file_sync("document.pdf", None, &ExtractionConfig::default()) {
        Ok(result) => {
            println!("Extracted {} characters", result.content.len());
        }
        Err(KreuzbergError::Parsing { message, .. }) => {
            eprintln!("Failed to parse document: {}", message);
        }
        Err(KreuzbergError::Ocr { message, .. }) => {
            eprintln!("OCR processing failed: {}", message);
        }
        Err(KreuzbergError::MissingDependency { message, .. }) => {
            eprintln!("Missing dependency: {}", message);
        }
        Err(e) => {
            eprintln!("Extraction failed: {}", e);
        }
    }

    // Bytes extraction: report the failure category, then return the error.
    let pdf_bytes = b"%PDF-1.4\n...";
    match extract_bytes_sync(pdf_bytes, "application/pdf", &ExtractionConfig::default()) {
        Ok(result) => {
            // Truncate on a character boundary. Byte slicing (`[..100]`) panics
            // when index 100 falls inside a multi-byte UTF-8 sequence.
            let preview: String = result.content.chars().take(100).collect();
            println!("Extracted: {}", preview);
            Ok(())
        }
        Err(err) => {
            match &err {
                KreuzbergError::Validation { message, .. } => {
                    eprintln!("Invalid configuration: {}", message);
                }
                KreuzbergError::Ocr { message, .. } => {
                    eprintln!("OCR failed: {}", message);
                }
                other => {
                    eprintln!("Extraction failed: {}", other);
                }
            }
            // Return the original error rather than rebuilding it, so its
            // `source` is not discarded.
            Err(err)
        }
    }
}
Ruby
require 'kreuzberg'

begin
  result = Kreuzberg.extract_file_sync('document.pdf')
  puts result.content
rescue Kreuzberg::ValidationError => e
  puts "Invalid configuration: #{e.message}"
rescue Kreuzberg::ParsingError => e
  puts "Failed to parse document: #{e.message}"
rescue Kreuzberg::OCRError => e
  puts "OCR processing failed: #{e.message}"
rescue Kreuzberg::MissingDependencyError => e
  puts "Missing dependency: #{e.message}"
rescue Kreuzberg::Error => e
  puts "Extraction error: #{e.message}"
rescue StandardError => e
  puts "System error: #{e.message}"
end
Java
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;

try {
    ExtractionResult result = Kreuzberg.extractFile("document.pdf");
    System.out.println("Extracted: " + result.getContent()
        .substring(0, Math.min(100, result.getContent().length())));
} catch (IOException e) {
    System.err.println("File not found: " + e.getMessage());
} catch (KreuzbergException e) {
    System.err.println("Extraction failed: " + e.getMessage());
}

try {
    byte[] pdfBytes = new byte[] { };
    ExtractionResult result = Kreuzberg.extractBytes(pdfBytes, "application/pdf", null);
    System.out.println("Extracted " + result.getContent().length() + " characters");
} catch (KreuzbergException e) {
    System.err.println("Extraction failed: " + e.getMessage());
}
Go
package main

import (
    "errors"
    "fmt"
    "log"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

func main() {
    result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
    if err != nil {
        switch {
        case errors.As(err, new(*kreuzberg.ValidationError)):
            log.Fatalf("invalid configuration: %v", err)
        case errors.As(err, new(*kreuzberg.ParsingError)):
            log.Fatalf("failed to parse document: %v", err)
        case errors.As(err, new(*kreuzberg.OCRError)):
            log.Fatalf("OCR processing failed: %v", err)
        case errors.As(err, new(*kreuzberg.MissingDependencyError)):
            log.Fatalf("missing dependency: %v", err)
        default:
            log.Fatalf("extraction error: %v", err)
        }
    }

    fmt.Println(result.Content)
}
C#
using Kreuzberg;

try
{
    var result = KreuzbergClient.ExtractFileSync("missing.pdf");
    Console.WriteLine(result.Content);
}
catch (KreuzbergValidationException ex)
{
    Console.Error.WriteLine($"Validation error: {ex.Message}");
}
catch (KreuzbergIOException ex)
{
    Console.Error.WriteLine($"IO error: {ex.Message}");
    throw;
}
catch (KreuzbergException ex)
{
    Console.Error.WriteLine($"Extraction failed: {ex.Message}");
    throw;
}
WASM
import { initWasm, extractBytes } from '@kreuzberg/wasm';

try {
    await initWasm();
    const bytes = new Uint8Array(buffer);
    const result = await extractBytes(bytes, 'application/pdf');
    console.log(result.content);
} catch (error) {
    if (error instanceof Error) {
        console.error(`Extraction error: ${error.message}`);
    } else {
        throw error;
    }
}

System Errors

OSError (Python), std::io::Error (Rust), IOException (Java), and other system-level errors always bubble up to users. These indicate real system problems that need to be addressed (permissions, disk space, etc.).

Next Steps