Quick Start¶

Get up and running with Kreuzberg in minutes.

Choosing Your TypeScript Package

Kreuzberg provides two TypeScript packages for different runtimes:

@kreuzberg/node – Use for Node.js servers and CLI tools (native performance, 100% speed)
@kreuzberg/wasm – Use for browsers, Cloudflare Workers, Deno, Bun, and serverless (60-80% speed, cross-platform)

The examples below show both. Pick the one matching your runtime. See Platform Overview for detailed guidance.

Basic Extraction¶

Extract text from any supported document format:

C#GoJavaPythonRubyRRustElixirTypeScriptWASMCLI

C#

using Kreuzberg;

var result = KreuzbergClient.ExtractFileSync("document.pdf", new ExtractionConfig());

Console.WriteLine(result.Content);
Console.WriteLine($"Tables: {result.Tables.Count}");
Console.WriteLine($"Metadata: {result.Metadata.FormatType}");

Go

package main

import (
    "fmt"
    "log"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

func main() {
    result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }

    fmt.Println(result.Content)
    fmt.Printf("Tables: %d\n", len(result.Tables))
    fmt.Printf("Metadata: %+v\n", result.Metadata)
}

Java

import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;

try {
    ExtractionResult result = Kreuzberg.extractFile("document.pdf");

    System.out.println(result.getContent());
    System.out.println("Tables: " + result.getTables().size());
    System.out.println("Metadata: " + result.getMetadata());
} catch (IOException | KreuzbergException e) {
    e.printStackTrace();
}

Python

from kreuzberg import extract_file_sync, ExtractionConfig

config: ExtractionConfig = ExtractionConfig()
result = extract_file_sync("document.pdf", config=config)

content: str = result.content
table_count: int = len(result.tables)
metadata: dict = result.metadata

print(f"Content length: {len(content)} characters")
print(f"Tables: {table_count}")
print(f"Metadata keys: {list(metadata.keys())}")

Ruby

require 'kreuzberg'

result = Kreuzberg.extract_file_sync('document.pdf')

puts result.content
puts "Tables: #{result.tables.length}"
puts "Metadata: #{result.metadata}"

R

library(kreuzberg)

# Extract a file synchronously
result <- extract_file_sync("path/to/document.pdf")

# Access extraction results
cat("Content length:", nchar(result$content), "\n")
cat("Mime type:", result$mime_type, "\n")
cat("Pages:", page_count(result), "\n")
cat("Quality score:", result$quality_score, "\n")

Rust

use kreuzberg::{extract_file_sync, ExtractionConfig};

fn main() -> kreuzberg::Result<()> {
    let result = extract_file_sync("document.pdf", None, &ExtractionConfig::default())?;

    println!("{}", result.content);
    println!("Tables: {}", result.tables.len());
    println!("Metadata: {:?}", result.metadata);
    Ok(())
}

Elixir

{:ok, result} = Kreuzberg.extract_file("document.pdf")

content = result.content
table_count = length(result.tables)
metadata = result.metadata

IO.puts("Content length: #{byte_size(content)} characters")
IO.puts("Tables: #{table_count}")
IO.puts("Metadata keys: #{inspect(Map.keys(metadata))}")

TypeScript

import { extractFileSync } from '@kreuzberg/node';

const result = extractFileSync('document.pdf');

console.log(result.content);
console.log(`Tables: ${result.tables.length}`);
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);

WASM

import { extractFromFile, initWasm } from '@kreuzberg/wasm';

await initWasm();

const fileInput = document.getElementById('file') as HTMLInputElement;
const file = fileInput.files?.[0];

if (file) {
    const result = await extractFromFile(file);
    console.log(result.content);
    console.log(`Tables: ${result.tables.length}`);
    console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
}

Bash

# Extract to stdout
kreuzberg extract document.pdf

# Save to file using shell redirection
kreuzberg extract document.pdf > output.txt

# Extract with JSON format (includes metadata)
kreuzberg extract document.pdf --format json

Async Extraction¶

For better performance with I/O-bound operations:

C#GoJavaPythonRubyRRustElixirTypeScriptWASMCLI

C#

using Kreuzberg;

var result = await KreuzbergClient.ExtractFileAsync("document.pdf");

Console.WriteLine(result.Content);
Console.WriteLine(result.MimeType);

Go

package main

import (
    "context"
    "fmt"
    "log"
    "time"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

func main() {
    ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
    defer cancel()

    result, err := kreuzberg.ExtractFile(ctx, "document.pdf", nil)
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }

    fmt.Println(result.Content)
}

Java

import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import java.nio.file.Path;
import java.util.concurrent.CompletableFuture;

public class Example {
    public static void main(String[] args) {
        CompletableFuture<ExtractionResult> future =
            Kreuzberg.extractFileAsync(Path.of("document.pdf"), null);

        future.thenAccept(result -> {
            System.out.println(result.getContent());
            System.out.println("Tables: " + result.getTables().size());
        }).join();
    }
}

Python

import asyncio
from kreuzberg import extract_file, ExtractionConfig

async def main() -> None:
    config: ExtractionConfig = ExtractionConfig()
    result = await extract_file("document.pdf", config=config)

    content: str = result.content
    table_count: int = len(result.tables)

    print(f"Content length: {len(content)} characters")
    print(f"Tables: {table_count}")

asyncio.run(main())

Ruby

require 'kreuzberg'

# Ruby uses blocking APIs; async variants call into Tokio internally.
result = Kreuzberg.extract_file('document.pdf')
puts result.content

R

library(kreuzberg)

# Note: extract_file() blocks in R despite being async
result <- extract_file("path/to/document.docx")

# Access extraction results
cat("Extracted", length(result$elements), "elements\n")
cat("Detected language:", result$detected_language, "\n")
cat("Tables found:", length(result$tables), "\n")

if (!is.null(result$keywords)) {
  cat("Keywords:", paste(result$keywords, collapse = ", "), "\n")
}

Rust

use kreuzberg::{extract_file, ExtractionConfig};

#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let result = extract_file("document.pdf", None, &ExtractionConfig::default()).await?;
    println!("{}", result.content);
    Ok(())
}

Elixir

task = Kreuzberg.extract_file_async("document.pdf")
{:ok, result} = Task.await(task)

content = result.content
table_count = length(result.tables)
metadata = result.metadata

IO.puts("Content length: #{byte_size(content)} characters")
IO.puts("Tables: #{table_count}")
IO.puts("Metadata keys: #{inspect(Map.keys(metadata))}")

TypeScript

import { extractFile } from '@kreuzberg/node';

const result = await extractFile('document.pdf');
console.log(result.content);

WASM

import { extractFromFile, initWasm } from '@kreuzberg/wasm';

await initWasm();

const fileInput = document.getElementById('file') as HTMLInputElement;
const file = fileInput.files?.[0];

if (file) {
    const result = await extractFromFile(file);
    const content = result.content;
    const tableCount = result.tables.length;

    console.log(`Content length: ${content.length} characters`);
    console.log(`Tables: ${tableCount}`);
}

Not Applicable

Async extraction is an API-level feature. The CLI operates synchronously. Use language-specific bindings (Python, TypeScript, Rust, WASM) for async operations.

OCR Extraction¶

Extract text from images and scanned documents:

C#GoJavaPythonRubyRRustElixirTypeScriptWASMCLI

C#

using Kreuzberg;

var config = new ExtractionConfig
{
    ForceOcr = true,
    Ocr = new OcrConfig
    {
        Backend = "tesseract",
        Language = "eng",
    },
};

var result = KreuzbergClient.ExtractFileSync("scanned.pdf", config);
Console.WriteLine(result.Content);
Console.WriteLine(result.DetectedLanguages);

Go

package main

import (
    "log"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

func main() {
    lang := "eng"
    cfg := &kreuzberg.ExtractionConfig{
        OCR: &kreuzberg.OCRConfig{
            Backend:  "tesseract",
            Language: &lang,
        },
    }

    result, err := kreuzberg.ExtractFileSync("scanned.pdf", cfg)
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }
    log.Println(len(result.Content))
}

Java

import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.OcrConfig;
import java.io.IOException;

public class Main {
    public static void main(String[] args) {
        try {
            ExtractionConfig config = ExtractionConfig.builder()
                .ocr(OcrConfig.builder()
                    .backend("tesseract")
                    .language("eng")
                    .build())
                .build();

            ExtractionResult result = Kreuzberg.extractFile("scanned.pdf", config);
            System.out.println(result.getContent());
        } catch (IOException | KreuzbergException e) {
            System.err.println("Extraction failed: " + e.getMessage());
        }
    }
}

Python

from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig

config: ExtractionConfig = ExtractionConfig(
    ocr=OcrConfig(backend="tesseract", language="eng")
)

result = extract_file_sync("scanned.pdf", config=config)

content: str = result.content
preview: str = content[:100]
total_length: int = len(content)

print(f"Extracted content (preview): {preview}")
print(f"Total characters: {total_length}")

Ruby

require 'kreuzberg'

ocr_config = Kreuzberg::Config::OCR.new(
  backend: 'tesseract',
  language: 'eng'
)

config = Kreuzberg::Config::Extraction.new(ocr: ocr_config)
result = Kreuzberg.extract_file_sync('scanned.pdf', config: config)
puts result.content

R

library(kreuzberg)

# Configure Tesseract OCR
ocr <- ocr_config(backend = "tesseract", language = "eng", dpi = 300L)
config <- extraction_config(force_ocr = TRUE, ocr = ocr)

# Extract text from a scanned image
result <- extract_file_sync("scan.png", config = config)

cat(sprintf("Extracted %d characters\n", nchar(result$content)))
cat(sprintf("Quality score: %s\n", result$quality_score))
cat("Content preview:\n")
cat(substr(result$content, 1, 200))

Rust

use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};

fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_file_sync("scanned.pdf", None, &config)?;
    println!("{}", result.content);
    Ok(())
}

Elixir

alias Kreuzberg.ExtractionConfig

config = %ExtractionConfig{
  ocr: %{"enabled" => true, "backend" => "tesseract"}
}

{:ok, result} = Kreuzberg.extract_file("scanned_document.pdf", nil, config)

content = result.content
IO.puts("OCR Extracted content:")
IO.puts(content)
IO.puts("Metadata: #{inspect(result.metadata)}")

TypeScript

import { extractFileSync } from '@kreuzberg/node';

const config = {
    ocr: {
        backend: 'tesseract',
        language: 'eng',
    },
};

const result = extractFileSync('scanned.pdf', null, config);
console.log(result.content);

WASM (Browser)

import { enableOcr, extractFromFile, initWasm } from '@kreuzberg/wasm';

await initWasm();
await enableOcr();

const fileInput = document.getElementById('file') as HTMLInputElement;
const file = fileInput.files?.[0];

if (file) {
    const result = await extractFromFile(file, file.type, {
        ocr: {
            backend: 'kreuzberg-tesseract',
            language: 'eng',
        },
    });
    console.log(result.content);
}

WASM (Node.js / Deno / Bun)

import { enableOcr, extractFile, initWasm } from '@kreuzberg/wasm';

await initWasm();
await enableOcr(); // Uses native kreuzberg-tesseract backend

const result = await extractFile('./scanned_document.png', 'image/png', {
    ocr: {
        backend: 'kreuzberg-tesseract',
        language: 'eng',
    },
});
console.log(result.content);

Bash

kreuzberg extract scanned.pdf --ocr true

Batch Processing¶

Process multiple files concurrently:

C#GoJavaPythonRubyRRustElixirTypeScriptWASMCLI

C#

using Kreuzberg;

var files = new[] { "doc1.pdf", "doc2.docx", "doc3.pptx" };
var results = KreuzbergClient.BatchExtractFilesSync(files, new ExtractionConfig());

foreach (var result in results)
{
    Console.WriteLine($"Content length: {result.Content.Length}");
}

Go

package main

import (
    "fmt"
    "log"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

func main() {
    files := []string{"doc1.pdf", "doc2.docx", "doc3.pptx"}

    results, err := kreuzberg.BatchExtractFilesSync(files, nil)
    if err != nil {
        log.Fatalf("batch extract failed: %v", err)
    }

    for i, result := range results {
        if result == nil {
            continue
        }
        fmt.Printf("File %d: %d characters\n", i+1, len(result.Content))
    }
}

Java

import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;

try {
    List<String> files = Arrays.asList("doc1.pdf", "doc2.docx", "doc3.pptx");

    List<ExtractionResult> results = Kreuzberg.batchExtractFiles(files, null);

    for (int i = 0; i < results.size(); i++) {
        ExtractionResult result = results.get(i);
        System.out.println("File " + (i + 1) + ": " + result.getContent().length() + " characters");
    }
} catch (IOException | KreuzbergException e) {
    e.printStackTrace();
}

Python

from kreuzberg import batch_extract_files_sync, ExtractionConfig

files: list[str] = ["doc1.pdf", "doc2.docx", "doc3.pptx"]
config: ExtractionConfig = ExtractionConfig()

results = batch_extract_files_sync(files, config=config)

for i, result in enumerate(results):
    char_count: int = len(result.content)
    print(f"File {i + 1}: {char_count} characters")

Ruby

require 'kreuzberg'

files = ['doc1.pdf', 'doc2.docx', 'doc3.pptx']

results = Kreuzberg.batch_extract_files_sync(files)

results.each_with_index do |result, i|
  puts "File #{i + 1}: #{result.content.length} characters"
end

R

library(kreuzberg)

# Define file paths to extract
file_paths <- c(
  "documents/report.pdf",
  "documents/summary.docx",
  "documents/data.xlsx"
)

# Batch extract files
results <- batch_extract_files_sync(file_paths)

# Process results
for (i in seq_along(results)) {
  result <- results[[i]]
  cat(sprintf("File %d: %s\n", i, file_paths[i]))
  cat(sprintf("  Pages: %d\n", page_count(result)))
  cat(sprintf("  Elements: %d\n", length(result$elements)))
}

Rust

use kreuzberg::{batch_extract_file_sync, ExtractionConfig};

fn main() -> kreuzberg::Result<()> {
    let files = vec!["doc1.pdf", "doc2.docx", "doc3.pptx"];
    let config = ExtractionConfig::default();

    let results = batch_extract_file_sync(files, &config)?;

    for (i, result) in results.iter().enumerate() {
        println!("File {}: {} characters", i + 1, result.content.len());
    }
    Ok(())
}

Elixir

file_paths = ["document1.pdf", "document2.pdf", "document3.pdf"]

{:ok, results} = Kreuzberg.batch_extract_files(file_paths)

Enum.each(results, fn result ->
  IO.puts("File: #{result.mime_type}")
  IO.puts("Content length: #{byte_size(result.content)} characters")
  IO.puts("Tables: #{length(result.tables)}")
  IO.puts("---")
end)

IO.puts("Total files processed: #{length(results)}")

TypeScript

import { batchExtractFilesSync } from '@kreuzberg/node';

const files = ['doc1.pdf', 'doc2.docx', 'doc3.pptx'];
const results = batchExtractFilesSync(files);

results.forEach((result, i) => {
    console.log(`File ${i + 1}: ${result.content.length} characters`);
});

WASM

import { extractFromFile, initWasm } from '@kreuzberg/wasm';

await initWasm();

const fileInputs = document.getElementById('files') as HTMLInputElement;
const files = Array.from(fileInputs.files || []);

const results = await Promise.all(
    files.map((file) => extractFromFile(file))
);

results.forEach((result, i) => {
    console.log(`File ${i + 1}: ${result.content.length} characters`);
});

Bash

# Process multiple files
kreuzberg extract doc1.pdf doc2.docx doc3.pptx

# Use glob patterns
kreuzberg extract documents/**/*.pdf

Extract from Bytes¶

When you already have file content in memory:

C#GoJavaPythonRubyRRustElixirTypeScriptWASMCLI

C#

using Kreuzberg;

var data = await File.ReadAllBytesAsync("document.pdf");
var result = KreuzbergClient.ExtractBytesSync(data, "application/pdf");

Console.WriteLine(result.Content);
Console.WriteLine(result.MimeType);

Go

package main

import (
    "log"
    "os"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

func main() {
    data, err := os.ReadFile("document.pdf")
    if err != nil {
        log.Fatalf("read file: %v", err)
    }

    result, err := kreuzberg.ExtractBytesSync(data, "application/pdf", nil)
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }

    log.Println(result.Content)
}

Java

import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;

try {
    byte[] data = Files.readAllBytes(Paths.get("document.pdf"));

    ExtractionResult result = Kreuzberg.extractBytes(
        data,
        "application/pdf",
        null
    );
    System.out.println(result.getContent());
} catch (IOException | KreuzbergException e) {
    e.printStackTrace();
}

Python

from kreuzberg import extract_bytes_sync, ExtractionConfig

with open("document.pdf", "rb") as f:
    data = f.read()

result = extract_bytes_sync(
    data,
    mime_type="application/pdf",
    config=ExtractionConfig()
)
print(result.content)

Ruby

require 'kreuzberg'

data = File.binread('document.pdf')

result = Kreuzberg.extract_bytes_sync(
    data,
    'application/pdf'
)
puts result.content

R

library(kreuzberg)

# Read file as binary data
file_data <- readBin("path/to/document.pdf", what = "raw", n = file.size("path/to/document.pdf"))

# Extract from bytes with explicit mime type
result <- extract_bytes_sync(file_data, mime_type = "application/pdf")

# Access extraction results
cat("Content preview:", substr(result$content, 1, 100), "\n")
cat("Mime type:", result$mime_type, "\n")
cat("Pages:", page_count(result), "\n")

Rust

use kreuzberg::{extract_bytes_sync, ExtractionConfig};
use std::fs;

fn main() -> kreuzberg::Result<()> {
    let data = fs::read("document.pdf")?;

    let result = extract_bytes_sync(
        &data,
        "application/pdf",
        &ExtractionConfig::default()
    )?;
    println!("{}", result.content);
    Ok(())
}

Elixir

# Read file into memory
{:ok, file_content} = File.read("document.pdf")

# Extract from bytes/binary data
{:ok, result} = Kreuzberg.extract(file_content, "application/pdf")

content = result.content
IO.puts("Extracted content:")
IO.puts(content)
IO.puts("MIME type: #{result.mime_type}")
IO.puts("Tables found: #{length(result.tables)}")

TypeScript

import { extractBytesSync } from '@kreuzberg/node';
import { readFileSync } from 'fs';

const data = readFileSync('document.pdf');
const result = extractBytesSync(data, 'application/pdf');
console.log(result.content);

WASM

import { extractBytes, initWasm } from '@kreuzberg/wasm';

await initWasm();

const response = await fetch('document.pdf');
const buffer = await response.arrayBuffer();
const data = new Uint8Array(buffer);

const result = await extractBytes(data, 'application/pdf');
console.log(result.content);

Not Applicable

The CLI operates on files from disk. For in-memory data processing, use language-specific bindings.

However, you can use CLI with pipes and temporary files:

Terminal

# Create temporary file from stdin and extract
cat data.pdf | kreuzberg extract /dev/stdin

# Or process piped content
curl https://example.com/document.pdf | \
  kreuzberg extract /dev/stdin

Advanced Configuration¶

Customize extraction behavior:

C#GoJavaPythonRubyRRustElixirTypeScriptWASMCLI

C#

using Kreuzberg;

var config = new ExtractionConfig
{
    Ocr = new OcrConfig { Backend = "tesseract", Language = "eng+deu" },
    Chunking = new ChunkingConfig { MaxChars = 1000, MaxOverlap = 100 },
    TokenReduction = new TokenReductionConfig { Enabled = true },
    LanguageDetection = new LanguageDetectionConfig
    {
        Enabled = true,
        DetectMultiple = true
    },
    UseCache = true,
    EnableQualityProcessing = true
};

var result = KreuzbergClient.ExtractFileSync("document.pdf", config);

foreach (var chunk in result.Chunks)
{
    Console.WriteLine($"Chunk: {chunk.Content[..Math.Min(100, chunk.Content.Length)]}");
}

if (result.DetectedLanguages?.Count > 0)
{
    Console.WriteLine($"Languages: {string.Join(", ", result.DetectedLanguages)}");
}

Go

package main

import (
    "fmt"
    "log"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

func main() {
    lang := "eng+deu" // Multiple languages
    chunkSize := 1000
    chunkOverlap := 100
    useCache := true
    enableQuality := true
    detectMultiple := true

    config := &kreuzberg.ExtractionConfig{
        OCR: &kreuzberg.OCRConfig{
            Backend:  "tesseract",
            Language: &lang,
        },
        Chunking: &kreuzberg.ChunkingConfig{
            ChunkSize:    &chunkSize,
            ChunkOverlap: &chunkOverlap,
        },
        LanguageDetection: &kreuzberg.LanguageDetectionConfig{
            Enabled:        &useCache,
            DetectMultiple: &detectMultiple,
        },
        UseCache:                &useCache,
        EnableQualityProcessing: &enableQuality,
    }

    result, err := kreuzberg.ExtractFileSync("document.pdf", config)
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }

    // Access chunks
    if len(result.Chunks) > 0 {
        snippet := result.Chunks[0].Content
        if len(snippet) > 100 {
            snippet = snippet[:100]
        }
        fmt.Printf("First chunk: %s...\n", snippet)
    }

    // Access detected languages
    if len(result.DetectedLanguages) > 0 {
        fmt.Printf("Languages: %v\n", result.DetectedLanguages)
    }
}

Java

import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import dev.kreuzberg.config.*;
import java.io.IOException;

public class Main {
    public static void main(String[] args) {
        try {
            ExtractionConfig config = ExtractionConfig.builder()
                .ocr(OcrConfig.builder()
                    .backend("tesseract")
                    .language("eng+deu")
                    .build())
                .chunking(ChunkingConfig.builder()
                    .maxChars(1000)
                    .maxOverlap(100)
                    .build())
                .tokenReduction(TokenReductionConfig.builder()
                    .mode("moderate")
                    .preserveImportantWords(true)
                    .build())
                .languageDetection(LanguageDetectionConfig.builder()
                    .enabled(true)
                    .build())
                .useCache(true)
                .enableQualityProcessing(true)
                .build();

            ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);

            if (!result.getDetectedLanguages().isEmpty()) {
                System.out.println("Languages: " + result.getDetectedLanguages());
            }
        } catch (IOException | KreuzbergException e) {
            System.err.println("Extraction failed: " + e.getMessage());
        }
    }
}

Python

from kreuzberg import (
    extract_file_sync,
    ExtractionConfig,
    OcrConfig,
    ChunkingConfig,
    TokenReductionConfig,
    LanguageDetectionConfig,
)

config = ExtractionConfig(
    ocr=OcrConfig(backend="tesseract", language="eng+deu"),
    chunking=ChunkingConfig(max_chars=1000, max_overlap=100),
    token_reduction=TokenReductionConfig(mode="light"),
    language_detection=LanguageDetectionConfig(
        enabled=True, detect_multiple=True
    ),
    use_cache=True,
    enable_quality_processing=True,
)

result = extract_file_sync("document.pdf", config=config)

for chunk in result.chunks or []:
    print(f"Chunk: {chunk.content[:100]}")

if result.detected_languages:
    print(f"Languages: {result.detected_languages}")

Ruby

require 'kreuzberg'

config = Kreuzberg::Config::Extraction.new(
  ocr: Kreuzberg::Config::OCR.new(
    backend: 'tesseract',
    language: 'eng+deu'
  ),
  chunking: Kreuzberg::Config::Chunking.new(
    max_characters: 1000,
    overlap: 100
  ),
  language_detection: Kreuzberg::Config::LanguageDetection.new,
  use_cache: true,
  enable_quality_processing: true
)

result = Kreuzberg.extract_file_sync('document.pdf', config: config)

result.chunks&.each { |chunk| puts chunk[0..100] }
puts "Languages: #{result.detected_languages.inspect}"

R

library(kreuzberg)

ocr_cfg <- ocr_config(backend = "tesseract", language = "eng", dpi = 300L)
chunking_cfg <- chunking_config(max_characters = 1500L, overlap = 300L)

config <- extraction_config(
  ocr = ocr_cfg,
  chunking = chunking_cfg,
  output_format = "markdown",
  include_document_structure = TRUE,
  force_ocr = TRUE
)

result <- extract_file_sync("document.pdf", "application/pdf", config)
cat(sprintf("Format: %s\n", result$mime_type))
cat(sprintf("Chunks: %d\n", length(result$chunks)))
cat(sprintf("Content preview: %.50s...\n", result$content))

Rust

use kreuzberg::{
    extract_file_sync, ChunkingConfig, ExtractionConfig, LanguageDetectionConfig, OcrConfig,
};

fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng+deu".to_string(),
            ..Default::default()
        }),

        chunking: Some(ChunkingConfig {
            max_characters: 1000,
            overlap: 100,
            ..Default::default()
        }),

        language_detection: Some(LanguageDetectionConfig {
            enabled: true,
            detect_multiple: true,
            ..Default::default()
        }),

        use_cache: true,
        enable_quality_processing: true,

        ..Default::default()
    };

    let result = extract_file_sync("document.pdf", None, &config)?;

    if let Some(chunks) = result.chunks {
        for chunk in chunks {
            let preview: String = chunk.content.chars().take(100).collect();
            println!("Chunk: {}...", preview);
        }
    }

    if let Some(languages) = result.detected_languages {
        println!("Languages: {:?}", languages);
    }
    Ok(())
}

Elixir

alias Kreuzberg.ExtractionConfig

config = %ExtractionConfig{
  ocr: %{"enabled" => true, "backend" => "tesseract"},
  chunking: %{"max_characters" => 1000, "overlap" => 100},
  language_detection: %{"enabled" => true},
  use_cache: true,
  force_ocr: false
}

{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

IO.puts("Content length: #{byte_size(result.content)} characters")
IO.puts("Detected languages: #{inspect(result.detected_languages)}")
IO.puts("Chunks: #{if result.chunks, do: length(result.chunks), else: 0}")

TypeScript

import { extractFileSync } from '@kreuzberg/node';

const config = {
    ocr: {
        backend: 'tesseract',
        language: 'eng+deu',
    },
    chunking: {
        maxChars: 1000,
        maxOverlap: 100,
    },
    tokenReduction: {
        mode: 'aggressive',
    },
    languageDetection: {
        enabled: true,
        detectMultiple: true,
    },
    useCache: true,
    enableQualityProcessing: true,
};

const result = extractFileSync('document.pdf', null, config);

if (result.chunks) {
    for (const chunk of result.chunks) {
        console.log(`Chunk: ${chunk.content.substring(0, 100)}...`);
    }
}

if (result.detectedLanguages) {
    console.log(`Languages: ${result.detectedLanguages.join(', ')}`);
}

WASM

import { extractFromFile, initWasm } from '@kreuzberg/wasm';

await initWasm();

const config = {
    ocr: {
        backend: 'tesseract-wasm',
        language: 'eng',
    },
    chunking: {
        maxChars: 1000,
        chunkOverlap: 100,
    },
    enable_language_detection: true,
    enable_quality: true,
};

const fileInput = document.getElementById('file') as HTMLInputElement;
const file = fileInput.files?.[0];

if (file) {
    const result = await extractFromFile(file, file.type, config);

    if (result.chunks) {
        for (const chunk of result.chunks) {
            console.log(`Chunk: ${chunk.content.substring(0, 100)}...`);
        }
    }

    if (result.detectedLanguages) {
        console.log(`Languages: ${result.detectedLanguages.join(', ')}`);
    }
}

Configure extraction behavior via command-line flags or config files:

Terminal

# Using command-line flags
kreuzberg extract document.pdf \
  --ocr \
  --chunk --chunk-size 1000 --chunk-overlap 100 \
  --detect-language \
  --quality

# Using config file
kreuzberg extract document.pdf --config kreuzberg.toml

kreuzberg.toml:

kreuzberg.toml

[ocr]
backend = "tesseract"
language = "eng"

[chunking]
max_characters = 1000
overlap = 100

[language_detection]
enabled = true
detect_multiple = true

enable_quality_processing = true
use_cache = true

kreuzberg.yaml:

kreuzberg.yaml

ocr:
  backend: tesseract
  language: eng

chunking:
  max_characters: 1000
  overlap: 100

language_detection:
  enabled: true
  detect_multiple: true

enable_quality_processing: true
use_cache: true

Working with Metadata¶

Access format-specific metadata from extracted documents:

C#GoJavaPythonRubyRRustElixirTypeScriptWASMCLI

C#

using Kreuzberg;

var config = new ExtractionConfig
{
    PdfOptions = new PdfConfig { ExtractMetadata = true }
};

var result = KreuzbergClient.ExtractFileSync("document.pdf", config);

if (result.Metadata?.Format.Pdf != null)
{
    var pdfMeta = result.Metadata.Format.Pdf;
    Console.WriteLine($"Pages: {pdfMeta.PageCount}");
    Console.WriteLine($"Author: {pdfMeta.Author}");
    Console.WriteLine($"Title: {pdfMeta.Title}");
}

var htmlResult = KreuzbergClient.ExtractFileSync("page.html", config);
if (htmlResult.Metadata?.Format.Html != null)
{
    var htmlMeta = htmlResult.Metadata.Format.Html;
    Console.WriteLine($"Title: {htmlMeta.Title}");
    Console.WriteLine($"Description: {htmlMeta.Description}");

    // Access keywords as array
    if (htmlMeta.Keywords != null && htmlMeta.Keywords.Count > 0)
    {
        Console.WriteLine($"Keywords: {string.Join(", ", htmlMeta.Keywords)}");
    }

    // Access canonical URL (renamed from canonical)
    if (htmlMeta.CanonicalUrl != null)
    {
        Console.WriteLine($"Canonical URL: {htmlMeta.CanonicalUrl}");
    }

    // Access Open Graph fields from dictionary
    if (htmlMeta.OpenGraph != null && htmlMeta.OpenGraph.Count > 0)
    {
        if (htmlMeta.OpenGraph.ContainsKey("image"))
            Console.WriteLine($"Open Graph Image: {htmlMeta.OpenGraph["image"]}");
        if (htmlMeta.OpenGraph.ContainsKey("title"))
            Console.WriteLine($"Open Graph Title: {htmlMeta.OpenGraph["title"]}");
        if (htmlMeta.OpenGraph.ContainsKey("type"))
            Console.WriteLine($"Open Graph Type: {htmlMeta.OpenGraph["type"]}");
    }

    // Access Twitter Card fields from dictionary
    if (htmlMeta.TwitterCard != null && htmlMeta.TwitterCard.Count > 0)
    {
        if (htmlMeta.TwitterCard.ContainsKey("card"))
            Console.WriteLine($"Twitter Card Type: {htmlMeta.TwitterCard["card"]}");
        if (htmlMeta.TwitterCard.ContainsKey("creator"))
            Console.WriteLine($"Twitter Creator: {htmlMeta.TwitterCard["creator"]}");
    }

    // Access new fields
    if (htmlMeta.Language != null)
        Console.WriteLine($"Language: {htmlMeta.Language}");

    if (htmlMeta.TextDirection != null)
        Console.WriteLine($"Text Direction: {htmlMeta.TextDirection}");

    // Access headers
    if (htmlMeta.Headers != null && htmlMeta.Headers.Count > 0)
        Console.WriteLine($"Headers: {string.Join(", ", htmlMeta.Headers.Select(h => h.Text))}");

    // Access links
    if (htmlMeta.Links != null && htmlMeta.Links.Count > 0)
    {
        foreach (var link in htmlMeta.Links)
            Console.WriteLine($"Link: {link.Href} ({link.Text})");
    }

    // Access images
    if (htmlMeta.Images != null && htmlMeta.Images.Count > 0)
        Console.WriteLine($"Images: {string.Join(", ", htmlMeta.Images.Select(i => i.Src))}");

    // Access structured data
    if (htmlMeta.StructuredData != null && htmlMeta.StructuredData.Count > 0)
        Console.WriteLine($"Structured Data items: {htmlMeta.StructuredData.Count}");
}

Go

package main

import (
    "fmt"
    "log"
    "strings"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

func main() {
    result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
    if err != nil {
        log.Fatalf("extract pdf: %v", err)
    }

    // Access PDF metadata
    if pdf, ok := result.Metadata.PdfMetadata(); ok {
        if pdf.PageCount != nil {
            fmt.Printf("Pages: %d\n", *pdf.PageCount)
        }
        if pdf.Author != nil {
            fmt.Printf("Author: %s\n", *pdf.Author)
        }
        if pdf.Title != nil {
            fmt.Printf("Title: %s\n", *pdf.Title)
        }
    }

    // Access HTML metadata
    htmlResult, err := kreuzberg.ExtractFileSync("page.html", nil)
    if err != nil {
        log.Fatalf("extract html: %v", err)
    }
    if html, ok := htmlResult.Metadata.HTMLMetadata(); ok {
        if html.Title != nil {
            fmt.Printf("Title: %s\n", *html.Title)
        }
        if html.Description != nil {
            fmt.Printf("Description: %s\n", *html.Description)
        }

        // Access keywords as array
        if len(html.Keywords) > 0 {
            fmt.Printf("Keywords: %s\n", strings.Join(html.Keywords, ", "))
        }

        // Access canonical URL (renamed from canonical)
        if html.CanonicalURL != nil {
            fmt.Printf("Canonical URL: %s\n", *html.CanonicalURL)
        }

        // Access Open Graph fields from map
        if len(html.OpenGraph) > 0 {
            if image, ok := html.OpenGraph["image"]; ok {
                fmt.Printf("Open Graph Image: %s\n", image)
            }
            if ogTitle, ok := html.OpenGraph["title"]; ok {
                fmt.Printf("Open Graph Title: %s\n", ogTitle)
            }
            if ogType, ok := html.OpenGraph["type"]; ok {
                fmt.Printf("Open Graph Type: %s\n", ogType)
            }
        }

        // Access Twitter Card fields from map
        if len(html.TwitterCard) > 0 {
            if card, ok := html.TwitterCard["card"]; ok {
                fmt.Printf("Twitter Card Type: %s\n", card)
            }
            if creator, ok := html.TwitterCard["creator"]; ok {
                fmt.Printf("Twitter Creator: %s\n", creator)
            }
        }

        // Access new fields
        if html.Language != nil {
            fmt.Printf("Language: %s\n", *html.Language)
        }

        if html.TextDirection != nil {
            fmt.Printf("Text Direction: %s\n", *html.TextDirection)
        }

        // Access headers
        if len(html.Headers) > 0 {
            headers := make([]string, len(html.Headers))
            for i, h := range html.Headers {
                headers[i] = h.Text
            }
            fmt.Printf("Headers: %s\n", strings.Join(headers, ", "))
        }

        // Access links
        if len(html.Links) > 0 {
            for _, link := range html.Links {
                fmt.Printf("Link: %s (%s)\n", link.Href, link.Text)
            }
        }

        // Access images
        if len(html.Images) > 0 {
            for _, image := range html.Images {
                fmt.Printf("Image: %s\n", image.Src)
            }
        }

        // Access structured data
        if len(html.StructuredData) > 0 {
            fmt.Printf("Structured data items: %d\n", len(html.StructuredData))
        }
    }
}

Java

import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;
import java.util.Map;
import java.util.List;

public class Main {
    public static void main(String[] args) {
        try {
            ExtractionResult result = Kreuzberg.extractFileSync("document.pdf");

            // Access PDF metadata
            @SuppressWarnings("unchecked")
            Map<String, Object> pdfMeta = (Map<String, Object>) result.getMetadata().get("pdf");
            if (pdfMeta != null) {
                System.out.println("Pages: " + pdfMeta.get("page_count"));
                System.out.println("Author: " + pdfMeta.get("author"));
                System.out.println("Title: " + pdfMeta.get("title"));
            }

            // Access HTML metadata
            ExtractionResult htmlResult = Kreuzberg.extractFileSync("page.html");
            @SuppressWarnings("unchecked")
            Map<String, Object> htmlMeta = (Map<String, Object>) htmlResult.getMetadata().get("html");
            if (htmlMeta != null) {
                System.out.println("Title: " + htmlMeta.get("title"));
                System.out.println("Description: " + htmlMeta.get("description"));

                // Access keywords as array
                @SuppressWarnings("unchecked")
                List<String> keywords = (List<String>) htmlMeta.get("keywords");
                if (keywords != null) {
                    System.out.println("Keywords: " + keywords);
                }

                // Access canonical URL (renamed from canonical)
                String canonicalUrl = (String) htmlMeta.get("canonical_url");
                if (canonicalUrl != null) {
                    System.out.println("Canonical URL: " + canonicalUrl);
                }

                // Access Open Graph fields from map
                @SuppressWarnings("unchecked")
                Map<String, String> openGraph = (Map<String, String>) htmlMeta.get("open_graph");
                if (openGraph != null) {
                    System.out.println("Open Graph Image: " + openGraph.get("image"));
                    System.out.println("Open Graph Title: " + openGraph.get("title"));
                    System.out.println("Open Graph Type: " + openGraph.get("type"));
                }

                // Access Twitter Card fields from map
                @SuppressWarnings("unchecked")
                Map<String, String> twitterCard = (Map<String, String>) htmlMeta.get("twitter_card");
                if (twitterCard != null) {
                    System.out.println("Twitter Card Type: " + twitterCard.get("card"));
                    System.out.println("Twitter Creator: " + twitterCard.get("creator"));
                }

                // Access new fields
                String language = (String) htmlMeta.get("language");
                if (language != null) {
                    System.out.println("Language: " + language);
                }

                String textDirection = (String) htmlMeta.get("text_direction");
                if (textDirection != null) {
                    System.out.println("Text Direction: " + textDirection);
                }

                // Access headers
                @SuppressWarnings("unchecked")
                List<Map<String, Object>> headers = (List<Map<String, Object>>) htmlMeta.get("headers");
                if (headers != null) {
                    headers.stream()
                        .map(h -> h.get("text"))
                        .forEach(text -> System.out.print(text + ", "));
                    System.out.println();
                }

                // Access links
                @SuppressWarnings("unchecked")
                List<Map<String, Object>> links = (List<Map<String, Object>>) htmlMeta.get("links");
                if (links != null) {
                    for (Map<String, Object> link : links) {
                        System.out.println("Link: " + link.get("href") + " (" + link.get("text") + ")");
                    }
                }

                // Access images
                @SuppressWarnings("unchecked")
                List<Map<String, Object>> images = (List<Map<String, Object>>) htmlMeta.get("images");
                if (images != null) {
                    for (Map<String, Object> image : images) {
                        System.out.println("Image: " + image.get("src"));
                    }
                }

                // Access structured data
                @SuppressWarnings("unchecked")
                List<Map<String, Object>> structuredData = (List<Map<String, Object>>) htmlMeta.get("structured_data");
                if (structuredData != null) {
                    System.out.println("Structured data items: " + structuredData.size());
                }
            }
        } catch (IOException | KreuzbergException e) {
            System.err.println("Extraction failed: " + e.getMessage());
        }
    }
}

Python

from kreuzberg import extract_file_sync, ExtractionConfig

result = extract_file_sync("document.pdf", config=ExtractionConfig())

pdf_meta: dict = result.metadata.get("pdf", {})
if pdf_meta:
    print(f"Pages: {pdf_meta.get('page_count')}")
    print(f"Author: {pdf_meta.get('author')}")
    print(f"Title: {pdf_meta.get('title')}")

result = extract_file_sync("page.html", config=ExtractionConfig())
html_meta: dict = result.metadata.get("html", {})
if html_meta:
    print(f"Title: {html_meta.get('title')}")
    print(f"Description: {html_meta.get('description')}")

    # Access keywords as array
    keywords = html_meta.get('keywords', [])
    if keywords:
        print(f"Keywords: {', '.join(keywords)}")

    # Access canonical URL (renamed from canonical)
    canonical_url = html_meta.get('canonical_url')
    if canonical_url:
        print(f"Canonical URL: {canonical_url}")

    # Access Open Graph fields from map
    open_graph = html_meta.get('open_graph', {})
    if open_graph:
        if 'image' in open_graph:
            print(f"Open Graph Image: {open_graph['image']}")
        if 'title' in open_graph:
            print(f"Open Graph Title: {open_graph['title']}")
        if 'type' in open_graph:
            print(f"Open Graph Type: {open_graph['type']}")

    # Access Twitter Card fields from map
    twitter_card = html_meta.get('twitter_card', {})
    if twitter_card:
        if 'card' in twitter_card:
            print(f"Twitter Card Type: {twitter_card['card']}")
        if 'creator' in twitter_card:
            print(f"Twitter Creator: {twitter_card['creator']}")

    # Access new fields
    language = html_meta.get('language')
    if language:
        print(f"Language: {language}")

    text_direction = html_meta.get('text_direction')
    if text_direction:
        print(f"Text Direction: {text_direction}")

    # Access headers
    headers = html_meta.get('headers', [])
    if headers:
        print(f"Headers: {', '.join([h['text'] for h in headers])}")

    # Access links
    links = html_meta.get('links', [])
    if links:
        for link in links:
            print(f"Link: {link.get('href')} ({link.get('text')})")

    # Access images
    images = html_meta.get('images', [])
    if images:
        for image in images:
            print(f"Image: {image.get('src')}")

    # Access structured data
    structured_data = html_meta.get('structured_data', [])
    if structured_data:
        print(f"Structured data items: {len(structured_data)}")

Ruby

require 'kreuzberg'

result = Kreuzberg.extract_file_sync('document.pdf')

# Access PDF metadata
if result.metadata['pdf']
  pdf_meta = result.metadata['pdf']
  puts "Pages: #{pdf_meta['page_count']}"
  puts "Author: #{pdf_meta['author']}"
  puts "Title: #{pdf_meta['title']}"
end

# Access HTML metadata
html_result = Kreuzberg.extract_file_sync('page.html')
if html_result.metadata['html']
  html_meta = html_result.metadata['html']
  puts "Title: #{html_meta['title']}"
  puts "Description: #{html_meta['description']}"

  # Access keywords as array
  puts "Keywords: #{html_meta['keywords']}"

  # Access canonical URL (renamed from canonical)
  puts "Canonical URL: #{html_meta['canonical_url']}" if html_meta['canonical_url']

  # Access Open Graph fields from map
  open_graph = html_meta['open_graph'] || {}
  puts "Open Graph Image: #{open_graph['image']}" if open_graph['image']
  puts "Open Graph Title: #{open_graph['title']}" if open_graph['title']
  puts "Open Graph Type: #{open_graph['type']}" if open_graph['type']

  # Access Twitter Card fields from map
  twitter_card = html_meta['twitter_card'] || {}
  puts "Twitter Card Type: #{twitter_card['card']}" if twitter_card['card']
  puts "Twitter Creator: #{twitter_card['creator']}" if twitter_card['creator']

  # Access new fields
  puts "Language: #{html_meta['language']}" if html_meta['language']
  puts "Text Direction: #{html_meta['text_direction']}" if html_meta['text_direction']

  # Access headers
  if html_meta['headers']
    puts "Headers: #{html_meta['headers'].map { |h| h['text'] }.join(', ')}"
  end

  # Access links
  if html_meta['links']
    html_meta['links'].each do |link|
      puts "Link: #{link['href']} (#{link['text']})"
    end
  end

  # Access images
  if html_meta['images']
    html_meta['images'].each do |image|
      puts "Image: #{image['src']}"
    end
  end

  # Access structured data
  if html_meta['structured_data']
    puts "Structured data items: #{html_meta['structured_data'].length}"
  end
end

R

library(kreuzberg)

result <- extract_file_sync("document.pdf")

cat("Detected Language:", result$detected_language, "\n")
cat("Quality Score:", result$quality_score, "\n")
cat("Keywords:", paste(result$keywords, collapse=", "), "\n\n")

cat("Metadata fields:\n")
author <- metadata_field(result, "author")
if (!is.null(author)) {
  cat("Author:", author, "\n")
}

created <- metadata_field(result, "created_date")
if (!is.null(created)) {
  cat("Created Date:", created, "\n")
}

pages_meta <- metadata_field(result, "total_pages")
if (!is.null(pages_meta)) {
  cat("Total Pages:", pages_meta, "\n")
}

Rust

use kreuzberg::{extract_file_sync, ExtractionConfig};

fn main() -> kreuzberg::Result<()> {
    let result = extract_file_sync("document.pdf", None, &ExtractionConfig::default())?;

    if let Some(pdf_meta) = result.metadata.pdf {
        if let Some(pages) = pdf_meta.page_count {
            println!("Pages: {}", pages);
        }
        if let Some(author) = pdf_meta.author {
            println!("Author: {}", author);
        }
        if let Some(title) = pdf_meta.title {
            println!("Title: {}", title);
        }
    }

    let html_result = extract_file_sync("page.html", None, &ExtractionConfig::default())?;
    if let Some(html_meta) = html_result.metadata.html {
        if let Some(title) = html_meta.title {
            println!("Title: {}", title);
        }
        if let Some(desc) = html_meta.description {
            println!("Description: {}", desc);
        }

        // Access keywords array
        println!("Keywords: {:?}", html_meta.keywords);

        // Access canonical URL (renamed from canonical)
        if let Some(canonical) = html_meta.canonical_url {
            println!("Canonical URL: {}", canonical);
        }

        // Access Open Graph fields as a map
        if let Some(og_image) = html_meta.open_graph.get("image") {
            println!("Open Graph Image: {}", og_image);
        }
        if let Some(og_title) = html_meta.open_graph.get("title") {
            println!("Open Graph Title: {}", og_title);
        }

        // Access Twitter Card fields as a map
        if let Some(twitter_card) = html_meta.twitter_card.get("card") {
            println!("Twitter Card Type: {}", twitter_card);
        }

        // Access new fields
        if let Some(lang) = html_meta.language {
            println!("Language: {}", lang);
        }

        // Access headers
        if !html_meta.headers.is_empty() {
            for header in &html_meta.headers {
                println!("Header (level {}): {}", header.level, header.text);
            }
        }

        // Access links
        if !html_meta.links.is_empty() {
            for link in &html_meta.links {
                println!("Link: {} ({})", link.href, link.text);
            }
        }

        // Access images
        if !html_meta.images.is_empty() {
            for image in &html_meta.images {
                println!("Image: {}", image.src);
            }
        }

        // Access structured data
        if !html_meta.structured_data.is_empty() {
            println!("Structured data items: {}", html_meta.structured_data.len());
        }
    }
    Ok(())
}

Elixir

{:ok, result} = Kreuzberg.extract_file("document.pdf")

# Access format-specific metadata
metadata = result.metadata
IO.puts("MIME type: #{result.mime_type}")
IO.puts("All metadata keys: #{inspect(Map.keys(metadata))}")

# Check PDF-specific metadata
case metadata["pdf"] do
  pdf_meta when is_map(pdf_meta) ->
    IO.puts("Page count: #{pdf_meta["page_count"]}")
    IO.puts("Author: #{pdf_meta["author"]}")
    IO.puts("Title: #{pdf_meta["title"]}")
  _ ->
    IO.puts("No PDF metadata available")
end

# Check HTML-specific metadata
case metadata["html"] do
  html_meta when is_map(html_meta) ->
    IO.puts("HTML keywords: #{inspect(html_meta["keywords"])}")
  _ ->
    IO.puts("No HTML metadata available")
end

TypeScript

import { extractFileSync } from '@kreuzberg/node';

const result = extractFileSync('document.pdf');
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
if (result.metadata.pageCount) {
    console.log(`Pages: ${result.metadata.pageCount}`);
}

const htmlResult = extractFileSync('page.html');
console.log(`HTML Metadata: ${JSON.stringify(htmlResult.metadata)}`);

const htmlMeta = htmlResult.metadata;
if (htmlMeta.title) {
    console.log(`Title: ${htmlMeta.title}`);
}

// Access keywords as array
if (htmlMeta.keywords && htmlMeta.keywords.length > 0) {
    console.log(`Keywords: ${htmlMeta.keywords.join(', ')}`);
}

// Access canonical URL (renamed from canonical)
if (htmlMeta.canonicalUrl) {
    console.log(`Canonical URL: ${htmlMeta.canonicalUrl}`);
}

// Access Open Graph fields from map
if (htmlMeta.openGraph) {
    if (htmlMeta.openGraph['image']) {
        console.log(`Open Graph Image: ${htmlMeta.openGraph['image']}`);
    }
    if (htmlMeta.openGraph['title']) {
        console.log(`Open Graph Title: ${htmlMeta.openGraph['title']}`);
    }
    if (htmlMeta.openGraph['type']) {
        console.log(`Open Graph Type: ${htmlMeta.openGraph['type']}`);
    }
}

// Access Twitter Card fields from map
if (htmlMeta.twitterCard) {
    if (htmlMeta.twitterCard['card']) {
        console.log(`Twitter Card Type: ${htmlMeta.twitterCard['card']}`);
    }
    if (htmlMeta.twitterCard['creator']) {
        console.log(`Twitter Creator: ${htmlMeta.twitterCard['creator']}`);
    }
}

// Access new fields
if (htmlMeta.language) {
    console.log(`Language: ${htmlMeta.language}`);
}

if (htmlMeta.textDirection) {
    console.log(`Text Direction: ${htmlMeta.textDirection}`);
}

// Access headers
if (htmlMeta.headers && htmlMeta.headers.length > 0) {
    console.log(`Headers: ${htmlMeta.headers.map(h => h.text).join(', ')}`);
}

// Access links
if (htmlMeta.links && htmlMeta.links.length > 0) {
    htmlMeta.links.forEach((link) => {
        console.log(`Link: ${link.href} (${link.text})`);
    });
}

// Access images
if (htmlMeta.images && htmlMeta.images.length > 0) {
    htmlMeta.images.forEach((image) => {
        console.log(`Image: ${image.src}`);
    });
}

// Access structured data
if (htmlMeta.structuredData && htmlMeta.structuredData.length > 0) {
    console.log(`Structured data items: ${htmlMeta.structuredData.length}`);
}

WASM

import { extractFromFile, initWasm } from '@kreuzberg/wasm';

await initWasm();

const fileInput = document.getElementById('file') as HTMLInputElement;
const file = fileInput.files?.[0];

if (file) {
    const result = await extractFromFile(file);
    console.log(`Metadata: ${JSON.stringify(result.metadata)}`);

    // Access common metadata fields
    if (result.metadata.title) {
        console.log(`Title: ${result.metadata.title}`);
    }

    // Access format-specific metadata
    const metadata = result.metadata;

    // For HTML files
    if (metadata.html) {
        const htmlMeta = metadata.html;
        console.log(`HTML Title: ${htmlMeta.title}`);
        console.log(`Description: ${htmlMeta.description}`);

        // Access keywords as array
        if (htmlMeta.keywords && htmlMeta.keywords.length > 0) {
            console.log(`Keywords: ${htmlMeta.keywords.join(', ')}`);
        }

        // Access canonical URL
        if (htmlMeta.canonical_url) {
            console.log(`Canonical URL: ${htmlMeta.canonical_url}`);
        }

        // Access Open Graph fields
        if (htmlMeta.open_graph) {
            if (htmlMeta.open_graph['title']) {
                console.log(`OG Title: ${htmlMeta.open_graph['title']}`);
            }
            if (htmlMeta.open_graph['image']) {
                console.log(`OG Image: ${htmlMeta.open_graph['image']}`);
            }
        }

        // Access Twitter Card fields
        if (htmlMeta.twitter_card && htmlMeta.twitter_card['card']) {
            console.log(`Twitter Card Type: ${htmlMeta.twitter_card['card']}`);
        }

        // Access headers
        if (htmlMeta.headers && htmlMeta.headers.length > 0) {
            console.log(`Headers: ${htmlMeta.headers.map((h: any) => h.text).join(', ')}`);
        }

        // Access links
        if (htmlMeta.links && htmlMeta.links.length > 0) {
            htmlMeta.links.forEach((link: any) => {
                console.log(`Link: ${link.href} (${link.text})`);
            });
        }

        // Access images
        if (htmlMeta.images && htmlMeta.images.length > 0) {
            htmlMeta.images.forEach((image: any) => {
                console.log(`Image: ${image.src}`);
            });
        }

        // Access structured data
        if (htmlMeta.structured_data && htmlMeta.structured_data.length > 0) {
            console.log(`Structured data items: ${htmlMeta.structured_data.length}`);
        }
    }

    // For PDF files
    if (metadata.pdf) {
        const pdfMeta = metadata.pdf;
        if (pdfMeta.page_count) {
            console.log(`Pages: ${pdfMeta.page_count}`);
        }
        if (pdfMeta.author) {
            console.log(`Author: ${pdfMeta.author}`);
        }
    }
}

Extract and parse metadata using JSON output:

Terminal

# Extract with metadata (JSON format includes metadata automatically)
kreuzberg extract document.pdf --format json

# Save to file and parse metadata
kreuzberg extract document.pdf --format json > result.json

# Extract PDF metadata
cat result.json | jq '.metadata.pdf'

# Extract HTML metadata
kreuzberg extract page.html --format json | jq '.metadata.html'

# Get specific fields
kreuzberg extract document.pdf --format json | \
  jq '.metadata | {page_count, author, title}'

# Process multiple files
kreuzberg batch documents/*.pdf --format json > all_metadata.json

JSON Output Structure:

JSON

{
  "content": "Extracted text...",
  "metadata": {
    "mime_type": "application/pdf",
    "pdf": {
      "page_count": 10,
      "author": "John Doe",
      "title": "Document Title"
    }
  }
}

Kreuzberg extracts format-specific metadata for:

PDF: page count, title, author, subject, keywords, dates
HTML: Rich metadata including SEO tags, Open Graph, Twitter Card, structured data, headers, links, images
Excel: sheet count, sheet names
Email: from, to, CC, BCC, message ID, attachments
PowerPoint: title, author, description, fonts
Images: dimensions, format, EXIF data
Archives: format, file count, file list, sizes
XML: element count, unique elements
Text/Markdown: word count, line count, headers, links

HTML Metadata Structure (v4.0+)

HTML metadata has been restructured for better organization: - keywords: Now a Vec<String> array (was Option<String>) - canonical → canonical_url: Renamed for clarity - Open Graph fields: Consolidated into open_graph: Map<String, String> (replacing individual og_* fields) - Twitter Card fields: Consolidated into twitter_card: Map<String, String> (replacing individual twitter_* fields) - New fields: headers, links, images, structured_data, language, text_direction, meta_tags

See Types Reference for complete HTML metadata reference and examples.

See Types Reference for complete metadata reference.

Working with Tables¶

Extract and process tables from documents:

C#GoJavaPythonRubyRRustElixirTypeScriptWASMCLI

C#

using Kreuzberg;

var result = KreuzbergClient.ExtractFileSync("document.pdf", new ExtractionConfig());

foreach (var table in result.Tables)
{
    Console.WriteLine($"Table with {table.Cells.Count} rows");
    Console.WriteLine(table.Markdown);

    foreach (var row in table.Cells)
    {
        Console.WriteLine(string.Join(" | ", row));
    }
}

Go

package main

import (
    "fmt"
    "log"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

func main() {
    result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }

    // Iterate over tables
    for _, table := range result.Tables {
        fmt.Printf("Table with %d rows\n", len(table.Cells))
        fmt.Println(table.Markdown) // Markdown representation

        // Access cells
        for _, row := range table.Cells {
            fmt.Println(row)
        }
    }
}

Java

import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import dev.kreuzberg.Table;
import java.io.IOException;
import java.util.List;

public class Main {
    public static void main(String[] args) {
        try {
            ExtractionResult result = Kreuzberg.extractFile("document.pdf");

            for (Table table : result.getTables()) {
                System.out.println("Table with " + table.cells().size() + " rows");
                System.out.println(table.markdown());

                for (List<String> row : table.cells()) {
                    System.out.println(row);
                }
            }
        } catch (IOException | KreuzbergException e) {
            System.err.println("Extraction failed: " + e.getMessage());
        }
    }
}

Python

from kreuzberg import extract_file_sync, ExtractionConfig, ExtractedTable

result = extract_file_sync("document.pdf", config=ExtractionConfig())

for table in result.tables:
    row_count: int = len(table.cells)
    print(f"Table with {row_count} rows")
    print(table.markdown)
    for row in table.cells:
        print(row)

Ruby

require 'kreuzberg'

result = Kreuzberg.extract_file_sync('document.pdf')

# Iterate over tables
result.tables.each do |table|
  puts "Table with #{table['cells'].length} rows"
  puts table['markdown']  # Markdown representation

  # Access cells
  table['cells'].each do |row|
    puts row
  end
end

R

library(kreuzberg)

result <- extract_file_sync("spreadsheet.xlsx")

cat("Tables extracted:", length(result$tables), "\n\n")

for (i in seq_along(result$tables)) {
  table <- result$tables[[i]]
  cat(sprintf("Table %d:\n", i))
  cat("  Rows:", nrow(table), "\n")
  cat("  Columns:", ncol(table), "\n")
  cat("  Column names:", paste(colnames(table), collapse=", "), "\n")
  cat("\n")

  if (nrow(table) > 0L) {
    cat("  Preview (first 3 rows):\n")
    print(head(table, 3L))
    cat("\n")
  }
}

Rust

use kreuzberg::{extract_file_sync, ExtractionConfig};

fn main() -> kreuzberg::Result<()> {
    let result = extract_file_sync("document.pdf", None, &ExtractionConfig::default())?;

    for table in &result.tables {
        println!("Table with {} rows", table.cells.len());
        println!("{}", table.markdown);

        for row in &table.cells {
            println!("{:?}", row);
        }
    }
    Ok(())
}

Elixir

{:ok, result} = Kreuzberg.extract_file("document.pdf")

tables = result.tables
IO.puts("Total tables found: #{length(tables)}")

Enum.with_index(tables, 1) |> Enum.each(fn {table, index} ->
  IO.puts("\n--- Table #{index} ---")

  # Access table cells
  cells = table["cells"] || []
  IO.puts("Rows: #{length(cells)}")

  # Access table markdown representation
  markdown = table["markdown"]
  IO.puts("Markdown representation:")
  IO.puts(markdown)
end)

TypeScript

import { extractFileSync } from '@kreuzberg/node';

const result = extractFileSync('document.pdf');

for (const table of result.tables) {
    console.log(`Table with ${table.cells.length} rows`);
    console.log(`Page: ${table.pageNumber}`);
    console.log(table.markdown);
}

WASM

import { extractFromFile, initWasm } from '@kreuzberg/wasm';

await initWasm();

const fileInput = document.getElementById('file') as HTMLInputElement;
const file = fileInput.files?.[0];

if (file) {
    const result = await extractFromFile(file);

    for (const table of result.tables) {
        console.log(`Table with ${table.cells.length} rows`);
        console.log(`Page: ${table.pageNumber}`);
        console.log(table.markdown);
    }
}

Extract and process tables from documents:

Terminal

# Extract with JSON format (includes tables when detected)
kreuzberg extract document.pdf --format json

# Save tables to JSON
kreuzberg extract spreadsheet.xlsx --format json > tables.json

# Extract and parse table markdown
kreuzberg extract document.pdf --format json | \
  jq '.tables[]? | .markdown'

# Get table cells
kreuzberg extract document.pdf --format json | \
  jq '.tables[]? | .cells'

# Batch extract tables from multiple files
kreuzberg batch documents/**/*.pdf --format json > all_tables.json

JSON Table Structure:

JSON

{
  "content": "...",
  "tables": [
    {
      "cells": [
        ["Name", "Age", "City"],
        ["Alice", "30", "New York"],
        ["Bob", "25", "Los Angeles"]
      ],
      "markdown": "| Name | Age | City |\n|------|-----|--------|\n| Alice | 30 | New York |\n| Bob | 25 | Los Angeles |"
    }
  ]
}

Error Handling¶

Handle extraction errors gracefully:

C#GoJavaPythonRubyRRustElixirTypeScriptWASM

C#

using Kreuzberg;

try
{
    var result = KreuzbergClient.ExtractFileSync("missing.pdf");
    Console.WriteLine(result.Content);
}
catch (KreuzbergValidationException ex)
{
    Console.Error.WriteLine($"Validation error: {ex.Message}");
}
catch (KreuzbergIOException ex)
{
    Console.Error.WriteLine($"IO error: {ex.Message}");
    throw;
}
catch (KreuzbergException ex)
{
    Console.Error.WriteLine($"Extraction failed: {ex.Message}");
    throw;
}

Go

package main

import (
    "errors"
    "fmt"
    "log"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

func main() {
    result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
    if err != nil {
        switch {
        case errors.As(err, new(*kreuzberg.ValidationError)):
            log.Fatalf("invalid configuration: %v", err)
        case errors.As(err, new(*kreuzberg.ParsingError)):
            log.Fatalf("failed to parse document: %v", err)
        case errors.As(err, new(*kreuzberg.OCRError)):
            log.Fatalf("OCR processing failed: %v", err)
        case errors.As(err, new(*kreuzberg.MissingDependencyError)):
            log.Fatalf("missing dependency: %v", err)
        default:
            log.Fatalf("extraction error: %v", err)
        }
    }

    fmt.Println(result.Content)
}

Java

import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;

try {
    ExtractionResult result = Kreuzberg.extractFile("document.pdf");
    System.out.println("Extracted: " + result.getContent()
        .substring(0, Math.min(100, result.getContent().length())));
} catch (IOException e) {
    System.err.println("File not found: " + e.getMessage());
} catch (KreuzbergException e) {
    System.err.println("Extraction failed: " + e.getMessage());
}

try {
    byte[] pdfBytes = new byte[] { };
    ExtractionResult result = Kreuzberg.extractBytes(pdfBytes, "application/pdf", null);
    System.out.println("Extracted " + result.getContent().length() + " characters");
} catch (KreuzbergException e) {
    System.err.println("Extraction failed: " + e.getMessage());
}

Python

from kreuzberg import extract_file_sync, extract_bytes_sync, ExtractionConfig
from kreuzberg import (
    KreuzbergError,
    ParsingError,
    OCRError,
    ValidationError,
)

try:
    result = extract_file_sync("document.pdf")
    print(f"Extracted {len(result.content)} characters")
except ParsingError as e:
    print(f"Failed to parse document: {e}")
except OCRError as e:
    print(f"OCR processing failed: {e}")
except KreuzbergError as e:
    print(f"Extraction error: {e}")

try:
    config: ExtractionConfig = ExtractionConfig()
    pdf_bytes: bytes = b"%PDF-1.4\n"
    result = extract_bytes_sync(pdf_bytes, "application/pdf", config)
    print(f"Extracted: {result.content[:100]}")
except ValidationError as e:
    print(f"Invalid configuration: {e}")
except OCRError as e:
    print(f"OCR failed: {e}")
except KreuzbergError as e:
    print(f"Extraction failed: {e}")

Ruby

require 'kreuzberg'

begin
  result = Kreuzberg.extract_file_sync('document.pdf')
  puts result.content
rescue Kreuzberg::ValidationError => e
  puts "Invalid configuration: #{e.message}"
rescue Kreuzberg::ParsingError => e
  puts "Failed to parse document: #{e.message}"
rescue Kreuzberg::OCRError => e
  puts "OCR processing failed: #{e.message}"
rescue Kreuzberg::MissingDependencyError => e
  puts "Missing dependency: #{e.message}"
rescue Kreuzberg::Error => e
  puts "Extraction error: #{e.message}"
rescue StandardError => e
  puts "System error: #{e.message}"
end

R

library(kreuzberg)

# Handle extraction errors with typed conditions
result <- tryCatch({
  extract_file_sync("document.xyz")
},
  UnsupportedFileType = function(e) {
    cat("Error: File type not supported\n")
    cat("Message:", conditionMessage(e), "\n")
    NULL
  },
  ValidationError = function(e) {
    cat("Error: Validation failed\n")
    cat("Message:", conditionMessage(e), "\n")
    NULL
  },
  kreuzberg_error = function(e) {
    cat("Error: Kreuzberg extraction failed\n")
    cat("Message:", conditionMessage(e), "\n")
    NULL
  }
)

if (!is.null(result)) {
  cat("Extraction successful\n")
}

Rust

use kreuzberg::{extract_file_sync, extract_bytes_sync, ExtractionConfig, KreuzbergError};

fn main() -> kreuzberg::Result<()> {
    match extract_file_sync("document.pdf", None, &ExtractionConfig::default()) {
        Ok(result) => {
            println!("Extracted {} characters", result.content.len());
        }
        Err(KreuzbergError::Parsing { message, .. }) => {
            eprintln!("Failed to parse document: {}", message);
        }
        Err(KreuzbergError::Ocr { message, .. }) => {
            eprintln!("OCR processing failed: {}", message);
        }
        Err(KreuzbergError::MissingDependency { message, .. }) => {
            eprintln!("Missing dependency: {}", message);
        }
        Err(e) => {
            eprintln!("Extraction failed: {}", e);
        }
    }

    let pdf_bytes = b"%PDF-1.4\n...";
    match extract_bytes_sync(pdf_bytes, "application/pdf", None, &ExtractionConfig::default()) {
        Ok(result) => {
            println!("Extracted: {}", &result.content[..100.min(result.content.len())]);
            Ok(())
        }
        Err(KreuzbergError::Validation { message, .. }) => {
            eprintln!("Invalid configuration: {}", message);
            Err(KreuzbergError::Validation {
                message: message.clone(),
                source: None,
            })
        }
        Err(KreuzbergError::Ocr { message, .. }) => {
            eprintln!("OCR failed: {}", message);
            Err(KreuzbergError::Ocr {
                message: message.clone(),
                source: None,
            })
        }
        Err(e) => {
            eprintln!("Extraction failed: {}", e);
            Err(e)
        }
    }
}

Elixir

# Example: Handling extraction errors
case Kreuzberg.extract_file("document.pdf") do
  {:ok, result} ->
    IO.puts("Successfully extracted content")
    IO.puts("Content length: #{byte_size(result.content)} characters")

  {:error, reason} ->
    IO.puts("Extraction failed: #{reason}")
end

# Example: Handling with custom error message
result = Kreuzberg.extract_file("nonexistent.pdf")

case result do
  {:ok, data} ->
    IO.puts("File processed successfully")
  {:error, error} ->
    IO.puts("Error details: #{inspect(error)}")
end

# Example: Extract with pattern matching
case Kreuzberg.extract(<<>>, "application/pdf") do
  {:ok, result} ->
    IO.puts("Content: #{result.content}")
  {:error, msg} when is_binary(msg) ->
    IO.puts("Validation error: #{msg}")
  {:error, reason} ->
    IO.puts("Unknown error: #{inspect(reason)}")
end

TypeScript

import { extractFileSync, KreuzbergError } from '@kreuzberg/node';

try {
    const result = extractFileSync('document.pdf');
    console.log(result.content);
} catch (error) {
    if (error instanceof KreuzbergError) {
        console.error(`Extraction error: ${error.message}`);
    } else {
        throw error;
    }
}

WASM

import { extractFromFile, initWasm } from '@kreuzberg/wasm';

await initWasm();

const fileInput = document.getElementById('file') as HTMLInputElement;
const file = fileInput.files?.[0];

if (file) {
    try {
        const result = await extractFromFile(file);
        console.log(result.content);
    } catch (error) {
        if (error instanceof Error) {
            console.error(`Extraction error: ${error.message}`);
        } else {
            throw error;
        }
    }
}

Next Steps¶

Contributing - Learn how to contribute to Kreuzberg