Configuration Guide¶
For complete configuration field documentation, see Configuration Reference.
Kreuzberg's behavior is controlled through configuration objects. All settings are optional with sensible defaults, allowing you to configure only what you need.
Configuration Discovery¶
Kreuzberg automatically discovers configuration files in these locations (in order):
- Current directory: ./kreuzberg.{toml,yaml,yml,json}
- User config: ~/.config/kreuzberg/config.{toml,yaml,yml,json}
- System config: /etc/kreuzberg/config.{toml,yaml,yml,json}
Once found, configuration is merged with defaults. No configuration file is required—if none is found, defaults are used.
Quick Start¶
Configuration Methods¶
Kreuzberg supports four ways to configure extraction:
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
config, err := kreuzberg.LoadExtractionConfigFromFile("")
if err != nil {
log.Fatalf("discover config failed: %v", err)
}
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
if err != nil {
log.Fatalf("extract failed: %v", err)
}
log.Printf("Content length: %d", len(result.Content))
}
# Extract a PDF with a default ExtractionConfig and print a short preview.
import asyncio

from kreuzberg import ExtractionConfig, extract_file


async def main() -> None:
    """Extract ``document.pdf`` and print its first 100 characters and total length."""
    config: ExtractionConfig = ExtractionConfig()
    result = await extract_file("document.pdf", config=config)

    content: str = result.content
    content_preview: str = content[:100]

    print(f"Content preview: {content_preview}")
    print(f"Total length: {len(content)}")


asyncio.run(main())
// Initialise the WASM module, then extract text from an in-memory PDF.
import { initWasm, extractBytes } from '@kreuzberg/wasm';

await initWasm();

const extractionOptions = {
  use_cache: true,
  enable_quality_processing: true,
  ocr: {
    backend: 'tesseract-wasm',
    language: 'eng',
  },
};

// `buffer` is not defined in this snippet; it is assumed to hold the PDF's raw bytes.
const pdfBytes = new Uint8Array(buffer);
const extraction = await extractBytes(pdfBytes, 'application/pdf', extractionOptions);
console.log(extraction.content);
Configuration Discovery¶
flowchart TD
%% Lookup order: current directory first, then the user config, then the system config.
Start[ExtractionConfig.discover] --> Current{Check Current Directory}
Current -->|Found| LoadCurrent[Load ./kreuzberg.*]
Current -->|Not Found| User{Check User Config}
User -->|Found| LoadUser[Load ~/.config/kreuzberg/config.*]
User -->|Not Found| System{Check System Config}
System -->|Found| LoadSystem[Load /etc/kreuzberg/config.*]
System -->|Not Found| Default[Use Default Config]
%% A file that was found is merged with the defaults; with no file, the defaults are returned directly.
LoadCurrent --> Merge[Merge with Defaults]
LoadUser --> Merge
LoadSystem --> Merge
Default --> Return[Return Config]
Merge --> Return
%% Fill colours distinguish the source the configuration was loaded from.
style LoadCurrent fill:#90EE90
style LoadUser fill:#87CEEB
style LoadSystem fill:#FFD700
style Default fill:#FFB6C1

Kreuzberg automatically discovers configuration files in the following locations (in order):
- Current directory: ./kreuzberg.{toml,yaml,yml,json}
- User config: ~/.config/kreuzberg/config.{toml,yaml,yml,json}
- System config: /etc/kreuzberg/config.{toml,yaml,yml,json}
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
config, err := kreuzberg.LoadExtractionConfigFromFile("")
if err != nil {
log.Fatalf("discover config failed: %v", err)
}
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
if err != nil {
log.Fatalf("extract failed: %v", err)
}
log.Printf("Content length: %d", len(result.Content))
}
# Extract a PDF with a default ExtractionConfig and print a short preview.
import asyncio

from kreuzberg import ExtractionConfig, extract_file


async def main() -> None:
    """Extract ``document.pdf`` and print its first 100 characters and total length."""
    config: ExtractionConfig = ExtractionConfig()
    result = await extract_file("document.pdf", config=config)

    content: str = result.content
    content_preview: str = content[:100]

    print(f"Content preview: {content_preview}")
    print(f"Total length: {len(content)}")


asyncio.run(main())
// Initialise the WASM module, then extract text from an in-memory PDF.
import { initWasm, extractBytes } from '@kreuzberg/wasm';

await initWasm();

const extractionOptions = {
  use_cache: true,
  enable_quality_processing: true,
  ocr: {
    backend: 'tesseract-wasm',
    language: 'eng',
  },
};

// `buffer` is not defined in this snippet; it is assumed to hold the PDF's raw bytes.
const pdfBytes = new Uint8Array(buffer);
const extraction = await extractBytes(pdfBytes, 'application/pdf', extractionOptions);
console.log(extraction.content);
Common Use Cases¶
Basic Configuration¶
Get started with minimal configuration:
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
useCache := true
enableQP := true
result, err := kreuzberg.ExtractFileSync("document.pdf", &kreuzberg.ExtractionConfig{
UseCache: &useCache,
EnableQualityProcessing: &enableQP,
})
if err != nil {
log.Fatalf("extract failed: %v", err)
}
log.Println("content length:", len(result.Content))
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.config.ExtractionConfig;

// Switch on caching and quality processing; no other setting is overridden.
ExtractionConfig extractionConfig = ExtractionConfig.builder()
    .useCache(true)
    .enableQualityProcessing(true)
    .build();

// Run the extraction with that configuration.
ExtractionResult extraction = Kreuzberg.extractFile("document.pdf", extractionConfig);
use kreuzberg::{extract_file, ExtractionConfig};

/// Extracts `document.pdf` with caching and quality processing enabled,
/// then prints the extracted content.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    // Only the two flags are overridden; every other field comes from `Default`.
    let overrides = ExtractionConfig {
        enable_quality_processing: true,
        use_cache: true,
        ..Default::default()
    };

    let extraction = extract_file("document.pdf", None, &overrides).await?;
    println!("{}", extraction.content);

    Ok(())
}
Setting Up OCR¶
Enable OCR for scanned documents and images:
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.OcrConfig;
import dev.kreuzberg.config.TesseractConfig;

// Tesseract-specific tuning: page segmentation mode 3.
TesseractConfig tesseract = TesseractConfig.builder()
    .psm(3)
    .build();

// OCR through the "tesseract" backend with the "eng+fra" language setting.
OcrConfig ocr = OcrConfig.builder()
    .backend("tesseract")
    .language("eng+fra")
    .tesseractConfig(tesseract)
    .build();

ExtractionConfig config = ExtractionConfig.builder()
    .ocr(ocr)
    .build();
# OCR example: "tesseract" backend, "eng+fra" language setting, page segmentation mode 3.
import asyncio

from kreuzberg import ExtractionConfig, OcrConfig, TesseractConfig, extract_file


async def main() -> None:
    """Extract ``document.pdf`` with OCR configured and print the extracted text."""
    config: ExtractionConfig = ExtractionConfig(
        ocr=OcrConfig(
            backend="tesseract",
            language="eng+fra",
            tesseract_config=TesseractConfig(psm=3),
        )
    )
    result = await extract_file("document.pdf", config=config)
    print(result.content)


asyncio.run(main())
use kreuzberg::{ExtractionConfig, OcrConfig, TesseractConfig};

/// Builds an extraction config that enables OCR through the "tesseract" backend.
fn main() {
    // Tesseract tuning: only `psm` is overridden, the rest comes from `Default`.
    let tesseract = TesseractConfig {
        psm: 3,
        ..Default::default()
    };

    let ocr = OcrConfig {
        backend: "tesseract".to_string(),
        language: Some("eng+fra".to_string()),
        tesseract_config: Some(tesseract),
    };

    let config = ExtractionConfig {
        ocr: Some(ocr),
        ..Default::default()
    };
}
For more OCR options, see Tesseract Configuration in the reference.
Chunking for RAG¶
Configure text chunking for vector database ingestion:
using Kreuzberg;
using System;
using System.Collections.Generic;
using System.Threading.Tasks;

// Embedding settings attached to the chunking configuration.
var embeddingSettings = new EmbeddingConfig
{
    Model = EmbeddingModelType.Preset("balanced"),
    Normalize = true,
    BatchSize = 32,
    ShowDownloadProgress = false
};

var config = new ExtractionConfig
{
    Chunking = new ChunkingConfig
    {
        MaxChars = 512,
        MaxOverlap = 50,
        Embedding = embeddingSettings
    }
};

var result = await Kreuzberg.ExtractFileAsync("document.pdf", config);

// Fall back to an empty list so the loop below is safe when no chunks are returned.
var chunks = result.Chunks ?? new List<Chunk>();

foreach (var (position, piece) in chunks.WithIndex())
{
    var chunkId = $"doc_chunk_{position}";

    // Print at most the first 50 characters of each chunk.
    var previewLength = Math.Min(50, piece.Content.Length);
    var preview = piece.Content[..previewLength];
    Console.WriteLine($"Chunk {chunkId}: {preview}");

    if (piece.Embedding == null)
    {
        continue;
    }

    Console.WriteLine($" Embedding dimensions: {piece.Embedding.Length}");
}

/// <summary>Helper that pairs each element of a sequence with its zero-based position.</summary>
internal static class EnumerableExtensions
{
    /// <summary>Yields <c>(Index, Item)</c> tuples for <paramref name="items"/>, counting from 0.</summary>
    public static IEnumerable<(int Index, T Item)> WithIndex<T>(
        this IEnumerable<T> items)
    {
        var position = 0;
        foreach (var element in items)
        {
            yield return (position, element);
            position++;
        }
    }
}
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
maxChars := 1000
maxOverlap := 200
config := &kreuzberg.ExtractionConfig{
Chunking: &kreuzberg.ChunkingConfig{
MaxChars: &maxChars,
MaxOverlap: &maxOverlap,
},
}
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
if err != nil {
log.Fatalf("extract failed: %v", err)
}
for i, chunk := range result.Chunks {
fmt.Printf("Chunk %d/%d (%d-%d)\n", i+1, chunk.Metadata.TotalChunks, chunk.Metadata.CharStart, chunk.Metadata.CharEnd)
fmt.Printf("%s...\n", chunk.Content[:min(len(chunk.Content), 100)])
}
}
func min(a, b int) int {
if a < b {
return a
}
return b
}
import dev.kreuzberg.config.ChunkingConfig;
import dev.kreuzberg.config.EmbeddingConfig;
import dev.kreuzberg.config.EmbeddingModelType;
import dev.kreuzberg.config.ExtractionConfig;

// Embedding model of type "preset", selected by name.
EmbeddingModelType embeddingModel = EmbeddingModelType.builder()
    .type("preset")
    .name("text-embedding-all-minilm-l6-v2")
    .build();

EmbeddingConfig embedding = EmbeddingConfig.builder()
    .model(embeddingModel)
    .build();

// Chunk limits: maxChars 1500, maxOverlap 200, with embeddings attached.
ChunkingConfig chunking = ChunkingConfig.builder()
    .maxChars(1500)
    .maxOverlap(200)
    .embedding(embedding)
    .build();

ExtractionConfig config = ExtractionConfig.builder()
    .chunking(chunking)
    .build();
use kreuzberg::{ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig};

/// Builds a chunking configuration with a preset embedding model and prints it.
fn main() {
    // Each level overrides only the fields shown; the rest comes from `Default`.
    let model = EmbeddingModelType {
        r#type: "preset".to_string(),
        name: Some("text-embedding-all-minilm-l6-v2".to_string()),
        ..Default::default()
    };

    let embedding = EmbeddingConfig {
        model: Some(model),
        ..Default::default()
    };

    let chunking = ChunkingConfig {
        max_chars: Some(1500),
        max_overlap: Some(200),
        embedding: Some(embedding),
        ..Default::default()
    };

    let config = ExtractionConfig {
        chunking: Some(chunking),
        ..Default::default()
    };

    println!("{:?}", config.chunking);
}
File Format Examples¶
TOML (Recommended)¶
# Top-level extraction flags.
use_cache = true
enable_quality_processing = true
# OCR settings.
[ocr]
backend = "tesseract"
language = "eng"
# Settings for the Tesseract backend, nested under `ocr`.
[ocr.tesseract_config]
psm = 3
YAML¶
# Top-level extraction flags.
use_cache: true
enable_quality_processing: true
# OCR settings. Nested keys must be indented; otherwise they parse as
# top-level keys and `ocr` ends up empty.
ocr:
  backend: tesseract
  language: eng
  tesseract_config:
    psm: 3
JSON¶
{
  "use_cache": true,
  "enable_quality_processing": true,
  "ocr": {
    "backend": "tesseract",
    "language": "eng",
    "tesseract_config": {
      "psm": 3
    }
  }
}
Configuration Field Reference¶
For complete documentation of all configuration fields, see Configuration Reference.
Key sections include:
- ExtractionConfig - Main configuration
- OcrConfig - OCR options
- TesseractConfig - Fine-grained OCR tuning
- ChunkingConfig - Text chunking
- TokenReductionConfig - Token optimization
- PageConfig - Page tracking
- All other configs - Complete field list