Skip to content

Configuration Guide

For complete configuration field documentation, see Configuration Reference.

Kreuzberg's behavior is controlled through configuration objects. All settings are optional with sensible defaults, allowing you to configure only what you need.

Configuration Discovery

Kreuzberg automatically discovers configuration files by checking these locations in order and using the first file it finds:

  1. Current directory: ./kreuzberg.{toml,yaml,yml,json}
  2. User config: ~/.config/kreuzberg/config.{toml,yaml,yml,json}
  3. System config: /etc/kreuzberg/config.{toml,yaml,yml,json}

Once found, configuration is merged with defaults. No configuration file is required—if none is found, defaults are used.

Quick Start

Configuration Methods

Kreuzberg supports four ways to configure extraction: programmatically in code, or through a TOML, YAML, or JSON configuration file:

C#
using Kreuzberg;

var config = new ExtractionConfig();
var result = await KreuzbergClient.ExtractFileAsync("document.pdf", config);

Console.WriteLine(result.Content[..Math.Min(100, result.Content.Length)]);
Console.WriteLine($"Total length: {result.Content.Length}");
Go
package main

import (
    "log"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

func main() {
    config, err := kreuzberg.LoadExtractionConfigFromFile("")
    if err != nil {
        log.Fatalf("discover config failed: %v", err)
    }

    result, err := kreuzberg.ExtractFileSync("document.pdf", config)
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }

    log.Printf("Content length: %d", len(result.Content))
}
Java
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.config.ExtractionConfig;

ExtractionConfig config = Kreuzberg.discoverExtractionConfig();
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
Python
import asyncio
from kreuzberg import ExtractionConfig, extract_file

async def main() -> None:
    config: ExtractionConfig = ExtractionConfig()
    result = await extract_file("document.pdf", config=config)

    content: str = result.content
    content_preview: str = content[:100]

    print(f"Content preview: {content_preview}")
    print(f"Total length: {len(content)}")

asyncio.run(main())
Ruby
require 'kreuzberg'

config = Kreuzberg::Config::Extraction.discover
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
Rust
use kreuzberg::{extract_file, ExtractionConfig};

#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig::discover()?;
    let result = extract_file("document.pdf", None, &config).await?;
    println!("{}", result.content);
    Ok(())
}
TypeScript
import { extractFile, ExtractionConfig } from '@kreuzberg/node';

const config = ExtractionConfig.discover();
if (config) {
  const result = await extractFile('document.pdf', null, config);
  console.log(result.content);
} else {
  console.log('No configuration file found');
}
WASM
import { initWasm, extractBytes } from '@kreuzberg/wasm';

await initWasm();

const config = {
  use_cache: true,
  enable_quality_processing: true,
  ocr: {
    backend: 'tesseract-wasm',
    language: 'eng'
  }
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, 'application/pdf', config);
console.log(result.content);
kreuzberg.toml
use_cache = true
enable_quality_processing = true

[ocr]
backend = "tesseract"
language = "eng"

[ocr.tesseract_config]
psm = 3
kreuzberg.yaml
use_cache: true
enable_quality_processing: true

ocr:
  backend: tesseract
  language: eng
  tesseract_config:
    psm: 3
kreuzberg.json
{
  "use_cache": true,
  "enable_quality_processing": true,
  "ocr": {
    "backend": "tesseract",
    "language": "eng",
    "tesseract_config": {
      "psm": 3
    }
  }
}

Configuration Discovery

flowchart TD
    Start[ExtractionConfig.discover] --> Current{Check Current Directory}

    Current -->|Found| LoadCurrent[Load ./kreuzberg.*]
    Current -->|Not Found| User{Check User Config}

    User -->|Found| LoadUser[Load ~/.config/kreuzberg/config.*]
    User -->|Not Found| System{Check System Config}

    System -->|Found| LoadSystem[Load /etc/kreuzberg/config.*]
    System -->|Not Found| Default[Use Default Config]

    LoadCurrent --> Merge[Merge with Defaults]
    LoadUser --> Merge
    LoadSystem --> Merge
    Default --> Return[Return Config]

    Merge --> Return

    style LoadCurrent fill:#90EE90
    style LoadUser fill:#87CEEB
    style LoadSystem fill:#FFD700
    style Default fill:#FFB6C1

Kreuzberg automatically discovers configuration files by checking the following locations in order and using the first file it finds:

  1. Current directory: ./kreuzberg.{toml,yaml,yml,json}
  2. User config: ~/.config/kreuzberg/config.{toml,yaml,yml,json}
  3. System config: /etc/kreuzberg/config.{toml,yaml,yml,json}
C#
using Kreuzberg;

var config = new ExtractionConfig();
var result = await KreuzbergClient.ExtractFileAsync("document.pdf", config);

Console.WriteLine(result.Content[..Math.Min(100, result.Content.Length)]);
Console.WriteLine($"Total length: {result.Content.Length}");
Go
package main

import (
    "log"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

func main() {
    config, err := kreuzberg.LoadExtractionConfigFromFile("")
    if err != nil {
        log.Fatalf("discover config failed: %v", err)
    }

    result, err := kreuzberg.ExtractFileSync("document.pdf", config)
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }

    log.Printf("Content length: %d", len(result.Content))
}
Java
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.config.ExtractionConfig;

ExtractionConfig config = Kreuzberg.discoverExtractionConfig();
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
Python
import asyncio
from kreuzberg import ExtractionConfig, extract_file

async def main() -> None:
    config: ExtractionConfig = ExtractionConfig()
    result = await extract_file("document.pdf", config=config)

    content: str = result.content
    content_preview: str = content[:100]

    print(f"Content preview: {content_preview}")
    print(f"Total length: {len(content)}")

asyncio.run(main())
Ruby
require 'kreuzberg'

config = Kreuzberg::Config::Extraction.discover
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
Rust
use kreuzberg::{extract_file, ExtractionConfig};

#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig::discover()?;
    let result = extract_file("document.pdf", None, &config).await?;
    println!("{}", result.content);
    Ok(())
}
TypeScript
import { extractFile, ExtractionConfig } from '@kreuzberg/node';

const config = ExtractionConfig.discover();
if (config) {
  const result = await extractFile('document.pdf', null, config);
  console.log(result.content);
} else {
  console.log('No configuration file found');
}
WASM
import { initWasm, extractBytes } from '@kreuzberg/wasm';

await initWasm();

const config = {
  use_cache: true,
  enable_quality_processing: true,
  ocr: {
    backend: 'tesseract-wasm',
    language: 'eng'
  }
};

const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, 'application/pdf', config);
console.log(result.content);

Common Use Cases

Basic Configuration

Get started with minimal configuration:

C#
using Kreuzberg;

var config = new ExtractionConfig
{
    UseCache = true,
    EnableQualityProcessing = true
};

var result = await KreuzbergClient.ExtractFileAsync("document.pdf", config);
Console.WriteLine(result.Content);
Go
package main

import (
    "log"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

func main() {
    useCache := true
    enableQP := true

    result, err := kreuzberg.ExtractFileSync("document.pdf", &kreuzberg.ExtractionConfig{
        UseCache:                &useCache,
        EnableQualityProcessing: &enableQP,
    })
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }

    log.Println("content length:", len(result.Content))
}
Java
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.config.ExtractionConfig;

ExtractionConfig config = ExtractionConfig.builder()
    .useCache(true)
    .enableQualityProcessing(true)
    .build();
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
Python
import asyncio
from kreuzberg import extract_file, ExtractionConfig

async def main() -> None:
    config = ExtractionConfig(
        use_cache=True,
        enable_quality_processing=True
    )
    result = await extract_file("document.pdf", config=config)
    print(result.content)

asyncio.run(main())
Ruby
require 'kreuzberg'

config = Kreuzberg::Config::Extraction.new(
  use_cache: true,
  enable_quality_processing: true
)

result = Kreuzberg.extract_file_sync('document.pdf', config: config)
Rust
use kreuzberg::{extract_file, ExtractionConfig};

#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        use_cache: true,
        enable_quality_processing: true,
        ..Default::default()
    };

    let result = extract_file("document.pdf", None, &config).await?;
    println!("{}", result.content);
    Ok(())
}
TypeScript
import { extractFile } from '@kreuzberg/node';

const config = {
    useCache: true,
    enableQualityProcessing: true,
};

const result = await extractFile('document.pdf', null, config);
console.log(result.content);

Setting Up OCR

Enable OCR for scanned documents and images:

C#
using Kreuzberg;

var config = new ExtractionConfig
{
    Ocr = new OcrConfig
    {
        Backend = "tesseract",
        Language = "eng+fra",
        TesseractConfig = new TesseractConfig { Psm = 3 }
    }
};

var result = await KreuzbergClient.ExtractFileAsync("document.pdf", config);
Console.WriteLine(result.Content);
Go
package main

import "github.com/kreuzberg-dev/kreuzberg/packages/go/v4"

func main() {
    language := "eng+fra"
    psm := 3

    _ = &kreuzberg.ExtractionConfig{
        OCR: &kreuzberg.OCRConfig{
            Backend:  "tesseract",
            Language: &language,
            Tesseract: &kreuzberg.TesseractConfig{
                PSM: &psm,
            },
        },
    }
}
Java
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.OcrConfig;
import dev.kreuzberg.config.TesseractConfig;

ExtractionConfig config = ExtractionConfig.builder()
    .ocr(OcrConfig.builder()
        .backend("tesseract")
        .language("eng+fra")
        .tesseractConfig(TesseractConfig.builder()
            .psm(3)
            .build())
        .build())
    .build();
Python
import asyncio
from kreuzberg import ExtractionConfig, OcrConfig, TesseractConfig, extract_file

async def main() -> None:
    config: ExtractionConfig = ExtractionConfig(
        ocr=OcrConfig(
            backend="tesseract", language="eng+fra",
            tesseract_config=TesseractConfig(psm=3)
        )
    )
    result = await extract_file("document.pdf", config=config)
    print(result.content)

asyncio.run(main())
Ruby
require 'kreuzberg'

config = Kreuzberg::Config::Extraction.new(
  ocr: Kreuzberg::Config::OCR.new(
    backend: 'tesseract',
    language: 'eng+fra',
    tesseract_config: Kreuzberg::Config::Tesseract.new(psm: 3)
  )
)
Rust
use kreuzberg::{ExtractionConfig, OcrConfig, TesseractConfig};

fn main() {
    let config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: Some("eng+fra".to_string()),
            tesseract_config: Some(TesseractConfig {
                psm: 3,
                ..Default::default()
            }),
        }),
        ..Default::default()
    };
}
TypeScript
import { extractFile } from '@kreuzberg/node';

const config = {
    ocr: {
        backend: 'tesseract',
        language: 'eng+fra',
        tesseractConfig: {
            psm: 3,
        },
    },
};

const result = await extractFile('document.pdf', null, config);
console.log(result.content);

For more OCR options, see Tesseract Configuration in the reference.

Chunking for RAG

Configure text chunking for vector database ingestion:

C#
using Kreuzberg;
using System;
using System.Collections.Generic;
using System.Threading.Tasks;

var config = new ExtractionConfig
{
    Chunking = new ChunkingConfig
    {
        MaxChars = 512,
        MaxOverlap = 50,
        Embedding = new EmbeddingConfig
        {
            Model = EmbeddingModelType.Preset("balanced"),
            Normalize = true,
            BatchSize = 32,
            ShowDownloadProgress = false
        }
    }
};

var result = await KreuzbergClient.ExtractFileAsync("document.pdf", config);

var chunks = result.Chunks ?? new List<Chunk>();
foreach (var (index, chunk) in chunks.WithIndex())
{
    var chunkId = $"doc_chunk_{index}";
    Console.WriteLine($"Chunk {chunkId}: {chunk.Content[..Math.Min(50, chunk.Content.Length)]}");

    if (chunk.Embedding != null)
    {
        Console.WriteLine($"  Embedding dimensions: {chunk.Embedding.Length}");
    }
}

internal static class EnumerableExtensions
{
    public static IEnumerable<(int Index, T Item)> WithIndex<T>(
        this IEnumerable<T> items)
    {
        var index = 0;
        foreach (var item in items)
        {
            yield return (index++, item);
        }
    }
}
Go
package main

import (
    "fmt"
    "log"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

func main() {
    maxChars := 1000
    maxOverlap := 200
    config := &kreuzberg.ExtractionConfig{
        Chunking: &kreuzberg.ChunkingConfig{
            MaxChars:   &maxChars,
            MaxOverlap: &maxOverlap,
        },
    }

    result, err := kreuzberg.ExtractFileSync("document.pdf", config)
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }

    for i, chunk := range result.Chunks {
        fmt.Printf("Chunk %d/%d (%d-%d)\n", i+1, chunk.Metadata.TotalChunks, chunk.Metadata.CharStart, chunk.Metadata.CharEnd)
        fmt.Printf("%s...\n", chunk.Content[:min(len(chunk.Content), 100)])
    }
}

func min(a, b int) int {
    if a < b {
        return a
    }
    return b
}
Java
import dev.kreuzberg.config.ChunkingConfig;
import dev.kreuzberg.config.EmbeddingConfig;
import dev.kreuzberg.config.EmbeddingModelType;
import dev.kreuzberg.config.ExtractionConfig;

ExtractionConfig config = ExtractionConfig.builder()
    .chunking(ChunkingConfig.builder()
        .maxChars(1500)
        .maxOverlap(200)
        .embedding(EmbeddingConfig.builder()
            .model(EmbeddingModelType.builder()
                .type("preset")
                .name("text-embedding-all-minilm-l6-v2")
                .build())
            .build())
        .build())
    .build();
Python
from kreuzberg import (
    ExtractionConfig,
    ChunkingConfig,
    EmbeddingConfig,
    EmbeddingModelType,
)

config: ExtractionConfig = ExtractionConfig(
    chunking=ChunkingConfig(
        max_chars=1500,
        max_overlap=200,
        embedding=EmbeddingConfig(
            model=EmbeddingModelType.preset("all-minilm-l6-v2")
        ),
    )
)
Ruby
require 'kreuzberg'

config = Kreuzberg::Config::Extraction.new(
  chunking: Kreuzberg::Config::Chunking.new(
    max_chars: 1500,
    max_overlap: 200,
    embedding: Kreuzberg::Config::Embedding.new(
      model: Kreuzberg::EmbeddingModelType.new(
        type: 'preset',
        name: 'text-embedding-all-minilm-l6-v2'
      )
    )
  )
)
Rust
use kreuzberg::{ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig};

fn main() {
    let config = ExtractionConfig {
        chunking: Some(ChunkingConfig {
            max_chars: Some(1500),
            max_overlap: Some(200),
            embedding: Some(EmbeddingConfig {
                model: Some(EmbeddingModelType {
                    r#type: "preset".to_string(),
                    name: Some("text-embedding-all-minilm-l6-v2".to_string()),
                    ..Default::default()
                }),
                ..Default::default()
            }),
            ..Default::default()
        }),
        ..Default::default()
    };
    println!("{:?}", config.chunking);
}
TypeScript
import { extractFile } from '@kreuzberg/node';

const config = {
    chunking: {
        maxChars: 1500,
        maxOverlap: 200,
        embedding: {
            preset: 'quality',
        },
    },
};

const result = await extractFile('document.pdf', null, config);
console.log(`Chunks created: ${result.chunks?.length ?? 0}`);

File Format Examples

TOML

kreuzberg.toml
use_cache = true
enable_quality_processing = true

[ocr]
backend = "tesseract"
language = "eng"

[ocr.tesseract_config]
psm = 3

YAML

kreuzberg.yaml
use_cache: true
enable_quality_processing: true

ocr:
  backend: tesseract
  language: eng
  tesseract_config:
    psm: 3

JSON

kreuzberg.json
{
  "use_cache": true,
  "enable_quality_processing": true,
  "ocr": {
    "backend": "tesseract",
    "language": "eng",
    "tesseract_config": {
      "psm": 3
    }
  }
}

Configuration Field Reference

For complete documentation of all configuration fields, see Configuration Reference.

Key sections include:

- ExtractionConfig - Main configuration
- OcrConfig - OCR options
- TesseractConfig - Fine-grained OCR tuning
- ChunkingConfig - Text chunking
- TokenReductionConfig - Token optimization
- PageConfig - Page tracking
- All other configs - Complete field list