Skip to content

Configuration Guide v4.0.0

All extraction behavior is controlled through ExtractionConfig. Pass it directly in code or load it from a TOML/YAML/JSON file. Every field is optional. For per-field documentation, see the Configuration Reference.

Quick Start

Python
import asyncio
from kreuzberg import extract_file, ExtractionConfig

async def main() -> None:
    config = ExtractionConfig(
        use_cache=True,
        enable_quality_processing=True
    )
    result = await extract_file("document.pdf", config=config)
    print(result.content)

asyncio.run(main())
TypeScript
import { extractFile } from "@kreuzberg/node";

const config = {
  useCache: true,
  enableQualityProcessing: true,
};

const result = await extractFile("document.pdf", null, config);
console.log(result.content);
Rust
use kreuzberg::{extract_file, ExtractionConfig};

#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        use_cache: true,
        enable_quality_processing: true,
        ..Default::default()
    };

    let result = extract_file("document.pdf", None, &config).await?;
    println!("{}", result.content);
    Ok(())
}
Go
package main

import (
    "log"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)

func main() {
    useCache := true
    enableQP := true

    result, err := kreuzberg.ExtractFileSync("document.pdf", &kreuzberg.ExtractionConfig{
        UseCache:                &useCache,
        EnableQualityProcessing: &enableQP,
    })
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }

    log.Println("content length:", len(result.Content))
}
Java
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;

ExtractionConfig config = ExtractionConfig.builder()
    .useCache(true)
    .enableQualityProcessing(true)
    .build();
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
C#
using Kreuzberg;

var config = new ExtractionConfig
{
    UseCache = true,
    EnableQualityProcessing = true
};

var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config);
Console.WriteLine(result.Content);
Ruby
require 'kreuzberg'

config = Kreuzberg::ExtractionConfig.new(
  use_cache: true,
  enable_quality_processing: true
)

result = Kreuzberg.extract_file_sync('document.pdf', config: config)
R
library(kreuzberg)

config <- list(
  output_format = "markdown"
)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

cat(sprintf("MIME type: %s\n", result$mime_type))
cat(sprintf("Content length: %d characters\n", nchar(result$content)))
cat("Content preview:\n")
cat(substr(result$content, 1, 200))

Configuration Files

Three formats are supported. TOML is recommended.

kreuzberg.toml
use_cache = true
enable_quality_processing = true

[ocr]
backend = "tesseract"
language = "eng"

[ocr.tesseract_config]
psm = 3
kreuzberg.yaml
use_cache: true
enable_quality_processing: true

ocr:
  backend: tesseract
  language: eng
  tesseract_config:
    psm: 3
kreuzberg.json
{
  "use_cache": true,
  "enable_quality_processing": true,
  "ocr": {
    "backend": "tesseract",
    "language": "eng",
    "tesseract_config": {
      "psm": 3
    }
  }
}

Automatic Discovery

When no --config path is supplied, Kreuzberg walks up from the current working directory looking for kreuzberg.toml and uses the first match. YAML and JSON files are supported only when passed explicitly via --config. If nothing is found, defaults are used.

Python
import asyncio
from kreuzberg import ExtractionConfig, extract_file

async def main() -> None:
    config: ExtractionConfig = ExtractionConfig()
    result = await extract_file("document.pdf", config=config)

    content: str = result.content
    content_preview: str = content[:100]

    print(f"Content preview: {content_preview}")
    print(f"Total length: {len(content)}")

asyncio.run(main())
TypeScript
import { extractFile, ExtractionConfig } from "@kreuzberg/node";

const config = ExtractionConfig.discover();
if (config) {
  const result = await extractFile("document.pdf", null, config);
  console.log(result.content);
} else {
  console.log("No configuration file found");
}
Rust
use kreuzberg::{extract_file, ExtractionConfig};

#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig::discover()?.unwrap_or_default();
    let result = extract_file("document.pdf", None, &config).await?;
    println!("{}", result.content);
    Ok(())
}
Go
package main

import (
    "log"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)

func main() {
    config, err := kreuzberg.LoadExtractionConfigFromFile("")
    if err != nil {
        log.Fatalf("discover config failed: %v", err)
    }

    result, err := kreuzberg.ExtractFileSync("document.pdf", config)
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }

    log.Printf("Content length: %d", len(result.Content))
}
Java
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;

ExtractionConfig config = Kreuzberg.discoverExtractionConfig();
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
C#
using Kreuzberg;

var config = new ExtractionConfig();
var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config);

Console.WriteLine(result.Content[..Math.Min(100, result.Content.Length)]);
Console.WriteLine($"Total length: {result.Content.Length}");
Ruby
require 'kreuzberg'

config = Kreuzberg::ExtractionConfig.discover
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
R
library(kreuzberg)

# Load configuration from a JSON file and pass it to extract_file_sync.
config_json <- paste(readLines("kreuzberg.json"), collapse = "\n")
config <- ExtractionConfig$from_json(config_json)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(sprintf("Extracted %d characters\n", nchar(result$content)))
WASM
import { initWasm, extractBytes } from "@kreuzberg/wasm";

await initWasm();

const config = {
  use_cache: true,
  enable_quality_processing: true,
  ocr: {
    backend: "tesseract-wasm",
    language: "eng",
  },
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, "application/pdf", config);
console.log(result.content);

Common Use Cases

Setting Up OCR

Python
import asyncio
from kreuzberg import ExtractionConfig, OcrConfig, TesseractConfig, extract_file

async def main() -> None:
    config: ExtractionConfig = ExtractionConfig(
        ocr=OcrConfig(
            backend="tesseract", language="eng+fra",
            tesseract_config=TesseractConfig(psm=3)
        )
    )
    result = await extract_file("document.pdf", config=config)
    print(result.content)

asyncio.run(main())
TypeScript
import { extractFile } from "@kreuzberg/node";

const config = {
  ocr: {
    backend: "tesseract",
    language: "eng+fra",
    tesseractConfig: {
      psm: 3,
    },
  },
};

const result = await extractFile("document.pdf", null, config);
console.log(result.content);
Rust
use kreuzberg::{ExtractionConfig, OcrConfig, TesseractConfig};

fn main() {
    let config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng+fra".to_string(),
            tesseract_config: Some(TesseractConfig {
                psm: 3,
                ..Default::default()
            }),
            ..Default::default()
        }),
        ..Default::default()
    };
}
Go
package main

import "github.com/kreuzberg-dev/kreuzberg/packages/go/v5"

func main() {
    psm := int32(3)

    _ = kreuzberg.ExtractionConfig{
        Ocr: &kreuzberg.OcrConfig{
            Backend:  "tesseract",
            Language: "eng+fra",
            TesseractConfig: &kreuzberg.TesseractConfig{
                Psm: &psm,
            },
        },
    }
}
Java
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.OcrConfig;
import dev.kreuzberg.TesseractConfig;

ExtractionConfig config = ExtractionConfig.builder()
    .ocr(OcrConfig.builder()
        .backend("tesseract")
        .language("eng+fra")
        .tesseractConfig(TesseractConfig.builder()
            .psm(3)
            .build())
        .build())
    .build();
C#
using Kreuzberg;

var config = new ExtractionConfig
{
    Ocr = new OcrConfig
    {
        Backend = "tesseract",
        Language = "eng+fra",
        TesseractConfig = new TesseractConfig { Psm = 3 }
    }
};

var result = await KreuzbergLib.ExtractFileAsync("document.pdf", config);
Console.WriteLine(result.Content);
Ruby
require 'kreuzberg'

config = Kreuzberg::ExtractionConfig.new(
  ocr: Kreuzberg::OcrConfig.new(
    backend: 'tesseract',
    language: 'eng+fra',
    tesseract_config: Kreuzberg::TesseractConfig.new(psm: 3)
  )
)
R
library(kreuzberg)

config <- list(
  force_ocr = TRUE,
  ocr = list(backend = "tesseract", language = "eng")
)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

cat(sprintf("Extracted content length: %d\n", nchar(result$content)))
cat(sprintf("Detected language: %s\n", result$detected_language))

For backend selection and language packs, see OCR Guide. For fine-grained Tesseract tuning, see TesseractConfig Reference.

Chunking for RAG

Python
from kreuzberg import (
    ExtractionConfig,
    ChunkingConfig,
    EmbeddingConfig,
    EmbeddingModelType,
)

config: ExtractionConfig = ExtractionConfig(
    chunking=ChunkingConfig(
        max_chars=1500,
        max_overlap=200,
        embedding=EmbeddingConfig(
            model=EmbeddingModelType.preset("all-minilm-l6-v2")
        ),
    )
)
TypeScript
import { extractFile } from "@kreuzberg/node";

const config = {
  chunking: {
    maxChars: 1500,
    maxOverlap: 200,
    embedding: {
      preset: "quality",
    },
  },
};

const result = await extractFile("document.pdf", null, config);
console.log(`Chunks created: ${result.chunks?.length ?? 0}`);
Rust
use kreuzberg::{ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig};

fn main() {
    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_characters: 1500,
            overlap: 200,
            embedding: Some(EmbeddingConfig {
                model: EmbeddingModelType::Preset {
                    name: "text-embedding-all-minilm-l6-v2".to_string(),
                },
                ..Default::default()
            }),
            ..Default::default()
        }),
        ..Default::default()
    };
    println!("{:?}", config.chunking);
}
Go
package main

import (
    "fmt"
    "log"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)

func main() {
    maxChars := 1000
    maxOverlap := 200
    config := &kreuzberg.ExtractionConfig{
        Chunking: &kreuzberg.ChunkingConfig{
            MaxChars:   &maxChars,
            MaxOverlap: &maxOverlap,
        },
    }

    result, err := kreuzberg.ExtractFileSync("document.pdf", config)
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }

    for i, chunk := range result.Chunks {
        fmt.Printf("Chunk %d/%d (%d-%d)\n", i+1, chunk.Metadata.TotalChunks, chunk.Metadata.CharStart, chunk.Metadata.CharEnd)
        fmt.Printf("%s...\n", chunk.Content[:min(len(chunk.Content), 100)])
    }
}

func min(a, b int) int {
    if a < b {
        return a
    }
    return b
}
Java
import dev.kreuzberg.ChunkingConfig;
import dev.kreuzberg.EmbeddingConfig;
import dev.kreuzberg.EmbeddingModelType;
import dev.kreuzberg.ExtractionConfig;

ExtractionConfig config = ExtractionConfig.builder()
    .chunking(ChunkingConfig.builder()
        .maxChars(1500)
        .maxOverlap(200)
        .embedding(EmbeddingConfig.builder()
            .model(EmbeddingModelType.builder()
                .type("preset")
                .name("text-embedding-all-minilm-l6-v2")
                .build())
            .build())
        .build())
    .build();
C#
using Kreuzberg;
using System;
using System.Collections.Generic;
using System.Threading.Tasks;

var config = new ExtractionConfig
{
    Chunking = new ChunkingConfig
    {
        MaxChars = 512,
        MaxOverlap = 50,
        Embedding = new EmbeddingConfig
        {
            Model = EmbeddingModelType.Preset("balanced"),
            Normalize = true,
            BatchSize = 32,
            ShowDownloadProgress = false
        }
    }
};

var result = await Kreuzberg.ExtractFileAsync("document.pdf", config);

var chunks = result.Chunks ?? new List<Chunk>();
foreach (var (index, chunk) in chunks.WithIndex())
{
    var chunkId = $"doc_chunk_{index}";
    Console.WriteLine($"Chunk {chunkId}: {chunk.Content[..Math.Min(50, chunk.Content.Length)]}");

    if (chunk.Embedding != null)
    {
        Console.WriteLine($"  Embedding dimensions: {chunk.Embedding.Length}");
    }
}

internal static class EnumerableExtensions
{
    public static IEnumerable<(int Index, T Item)> WithIndex<T>(
        this IEnumerable<T> items)
    {
        var index = 0;
        foreach (var item in items)
        {
            yield return (index++, item);
        }
    }
}
Ruby
require 'kreuzberg'

config = Kreuzberg::ExtractionConfig.new(
  chunking: Kreuzberg::ChunkingConfig.new(
    max_characters: 1500,
    overlap: 200,
    embedding: Kreuzberg::EmbeddingConfig.new(
      model: Kreuzberg::EmbeddingModelType.new(
        type: 'preset',
        name: 'text-embedding-all-minilm-l6-v2'
      )
    )
  )
)
R
library(kreuzberg)

config <- list(
  chunking = list(max_characters = 1000L, overlap = 200L)
)

json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

cat(sprintf("Total chunks: %d\n", length(result$chunks)))
for (i in seq_len(min(5L, length(result$chunks)))) {
  cat(sprintf("Chunk %d: %d characters\n", i, nchar(result$chunks[[i]])))
}

All Configuration Categories

Next Steps

Edit this page on GitHub