Configuration Guide v4.0.0
All extraction behavior is controlled through ExtractionConfig. Pass it directly in code or load it from a TOML/YAML/JSON file. Every field is optional. For per-field documentation, see the Configuration Reference.
Quick Start
Rust
use kreuzberg::{extract_file, ExtractionConfig};
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
let config = ExtractionConfig {
use_cache: true,
enable_quality_processing: true,
..Default::default()
};
let result = extract_file("document.pdf", None, &config).await?;
println!("{}", result.content);
Ok(())
}
Go
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf with caching and quality processing enabled and
// logs the length of the extracted content.
func main() {
	// The config fields are set through pointers, so the flags need
	// addressable variables.
	cacheOn, qualityOn := true, true
	cfg := &kreuzberg.ExtractionConfig{
		UseCache:                &cacheOn,
		EnableQualityProcessing: &qualityOn,
	}

	res, err := kreuzberg.ExtractFileSync("document.pdf", cfg)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}
	log.Println("content length:", len(res.Content))
}
R
library(kreuzberg)
# Ask for Markdown output.
cfg <- list(output_format = "markdown")

# The extraction result is handed to jsonlite as a JSON string and parsed into
# nested lists before use.
raw_json <- extract_file_sync("document.pdf", "application/pdf", cfg)
parsed <- jsonlite::fromJSON(raw_json, simplifyVector = FALSE)

# Report the MIME type, the content length, and the first 200 characters.
cat(sprintf("MIME type: %s\n", parsed$mime_type))
cat(sprintf("Content length: %d characters\n", nchar(parsed$content)))
cat("Content preview:\n")
cat(substr(parsed$content, 1, 200))
Configuration Files
Three formats are supported: TOML, YAML, and JSON. TOML is recommended.
Automatic Discovery
When no --config path is supplied, Kreuzberg walks up from the current working directory looking for kreuzberg.toml and uses the first match. YAML and JSON files are supported only when passed explicitly via --config. If nothing is found, defaults are used.
Python
import asyncio
from kreuzberg import ExtractionConfig, extract_file
async def main() -> None:
    """Extract ``document.pdf`` with a default config and print a preview.

    Prints the first 100 characters of the extracted content followed by the
    total content length.
    """
    config: ExtractionConfig = ExtractionConfig()
    result = await extract_file("document.pdf", config=config)
    content: str = result.content
    content_preview: str = content[:100]
    print(f"Content preview: {content_preview}")
    print(f"Total length: {len(content)}")


asyncio.run(main())
Go
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main loads an extraction config, extracts document.pdf with it, and logs
// the length of the extracted content.
func main() {
	// NOTE(review): presumably the empty path asks the library to discover a
	// config file (the failure message says "discover config") — confirm.
	cfg, loadErr := kreuzberg.LoadExtractionConfigFromFile("")
	if loadErr != nil {
		log.Fatalf("discover config failed: %v", loadErr)
	}

	res, extractErr := kreuzberg.ExtractFileSync("document.pdf", cfg)
	if extractErr != nil {
		log.Fatalf("extract failed: %v", extractErr)
	}
	log.Printf("Content length: %d", len(res.Content))
}
R
library(kreuzberg)
# Read kreuzberg.json from disk, build an ExtractionConfig from its text, and
# run the extraction with that config.
json_lines <- readLines("kreuzberg.json")
cfg <- ExtractionConfig$from_json(paste(json_lines, collapse = "\n"))

raw_json <- extract_file_sync("document.pdf", "application/pdf", cfg)
parsed <- jsonlite::fromJSON(raw_json, simplifyVector = FALSE)
cat(sprintf("Extracted %d characters\n", nchar(parsed$content)))
WASM
import { initWasm, extractBytes } from "@kreuzberg/wasm";
await initWasm();

// OCR settings live in their own object and are nested into the config below.
const ocr = { backend: "tesseract-wasm", language: "eng" };
const options = {
  use_cache: true,
  enable_quality_processing: true,
  ocr,
};

// `buffer` is not defined in this snippet; it must already hold the PDF data.
const pdfBytes = new Uint8Array(buffer);
const { content } = await extractBytes(pdfBytes, "application/pdf", options);
console.log(content);
Common Use Cases
Setting Up OCR
Python
import asyncio
from kreuzberg import ExtractionConfig, OcrConfig, TesseractConfig, extract_file
async def main() -> None:
    """Extract ``document.pdf`` using a Tesseract OCR config and print the content.

    The OCR config selects the ``tesseract`` backend with language
    ``eng+fra`` and passes ``psm=3`` through ``TesseractConfig``.
    """
    config: ExtractionConfig = ExtractionConfig(
        ocr=OcrConfig(
            backend="tesseract", language="eng+fra",
            tesseract_config=TesseractConfig(psm=3)
        )
    )
    result = await extract_file("document.pdf", config=config)
    print(result.content)


asyncio.run(main())
Rust
use kreuzberg::{ExtractionConfig, OcrConfig, TesseractConfig};
fn main() {
let config = ExtractionConfig {
ocr: Some(OcrConfig {
backend: "tesseract".to_string(),
language: "eng+fra".to_string(),
tesseract_config: Some(TesseractConfig {
psm: 3,
..Default::default()
}),
..Default::default()
}),
..Default::default()
};
}
Java
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.OcrConfig;
import dev.kreuzberg.TesseractConfig;
// Build the nested configs innermost-first, then assemble the top-level config.
TesseractConfig tesseract = TesseractConfig.builder()
    .psm(3)
    .build();
OcrConfig ocr = OcrConfig.builder()
    .backend("tesseract")
    .language("eng+fra")
    .tesseractConfig(tesseract)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .ocr(ocr)
    .build();
R
library(kreuzberg)
# Set force_ocr and select the tesseract backend with language "eng".
ocr_settings <- list(backend = "tesseract", language = "eng")
cfg <- list(force_ocr = TRUE, ocr = ocr_settings)

raw_json <- extract_file_sync("document.pdf", "application/pdf", cfg)
parsed <- jsonlite::fromJSON(raw_json, simplifyVector = FALSE)

cat(sprintf("Extracted content length: %d\n", nchar(parsed$content)))
cat(sprintf("Detected language: %s\n", parsed$detected_language))
For backend selection and language packs, see the OCR Guide. For fine-grained Tesseract tuning, see the TesseractConfig Reference.
Chunking for RAG
Rust
use kreuzberg::{ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig};
fn main() {
let config = ExtractionConfig {
chunking: Some(ChunkingConfig {
max_characters: 1500,
overlap: 200,
embedding: Some(EmbeddingConfig {
model: EmbeddingModelType::Preset {
name: "text-embedding-all-minilm-l6-v2".to_string(),
},
..Default::default()
}),
..Default::default()
}),
..Default::default()
};
println!("{:?}", config.chunking);
}
Go
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf with chunking enabled and prints, for every
// chunk, its position, character range, and a preview of at most 100 bytes.
func main() {
	// The chunking fields are set through pointers, so the limits need
	// addressable variables.
	chunkSize, chunkOverlap := 1000, 200
	chunking := &kreuzberg.ChunkingConfig{
		MaxChars:   &chunkSize,
		MaxOverlap: &chunkOverlap,
	}
	cfg := &kreuzberg.ExtractionConfig{Chunking: chunking}

	res, err := kreuzberg.ExtractFileSync("document.pdf", cfg)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	for idx, piece := range res.Chunks {
		meta := piece.Metadata
		fmt.Printf("Chunk %d/%d (%d-%d)\n", idx+1, meta.TotalChunks, meta.CharStart, meta.CharEnd)

		// Clamp the preview so short chunks are not sliced out of range.
		previewEnd := min(len(piece.Content), 100)
		fmt.Printf("%s...\n", piece.Content[:previewEnd])
	}
}
// min returns the smaller of the two integers a and b.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}
Java
import dev.kreuzberg.ChunkingConfig;
import dev.kreuzberg.EmbeddingConfig;
import dev.kreuzberg.EmbeddingModelType;
import dev.kreuzberg.ExtractionConfig;
// Build the nested configs innermost-first, then assemble the top-level config.
EmbeddingModelType model = EmbeddingModelType.builder()
    .type("preset")
    .name("text-embedding-all-minilm-l6-v2")
    .build();
EmbeddingConfig embedding = EmbeddingConfig.builder()
    .model(model)
    .build();
ChunkingConfig chunking = ChunkingConfig.builder()
    .maxChars(1500)
    .maxOverlap(200)
    .embedding(embedding)
    .build();
ExtractionConfig config = ExtractionConfig.builder()
    .chunking(chunking)
    .build();
C#
using Kreuzberg;
using System;
using System.Collections.Generic;
using System.Threading.Tasks;
// Embedding settings for the chunks: the "balanced" preset model.
var embedding = new EmbeddingConfig
{
    Model = EmbeddingModelType.Preset("balanced"),
    Normalize = true,
    BatchSize = 32,
    ShowDownloadProgress = false
};
var config = new ExtractionConfig
{
    Chunking = new ChunkingConfig
    {
        MaxChars = 512,
        MaxOverlap = 50,
        Embedding = embedding
    }
};

var extraction = await Kreuzberg.ExtractFileAsync("document.pdf", config);

// Fall back to an empty list so the loop below is safe when no chunks come back.
var chunkList = extraction.Chunks ?? new List<Chunk>();
foreach (var (position, piece) in chunkList.WithIndex())
{
    var label = $"doc_chunk_{position}";
    // Preview at most the first 50 characters of the chunk.
    var previewLength = Math.Min(50, piece.Content.Length);
    Console.WriteLine($"Chunk {label}: {piece.Content[..previewLength]}");

    if (piece.Embedding == null)
    {
        continue;
    }
    Console.WriteLine($" Embedding dimensions: {piece.Embedding.Length}");
}
/// <summary>Helpers for enumerating sequences together with their positions.</summary>
internal static class EnumerableExtensions
{
    /// <summary>
    /// Lazily pairs every element of <paramref name="items"/> with its
    /// zero-based position in the sequence.
    /// </summary>
    public static IEnumerable<(int Index, T Item)> WithIndex<T>(
        this IEnumerable<T> items)
    {
        var position = 0;
        using var cursor = items.GetEnumerator();
        while (cursor.MoveNext())
        {
            yield return (position, cursor.Current);
            position++;
        }
    }
}
R
library(kreuzberg)
# Enable chunking with max_characters = 1000 and overlap = 200.
config <- list(
  chunking = list(max_characters = 1000L, overlap = 200L)
)
json <- extract_file_sync("document.pdf", "application/pdf", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)
cat(sprintf("Total chunks: %d\n", length(result$chunks)))
# Report the size of at most the first five chunks.
for (i in seq_len(min(5L, length(result$chunks)))) {
  # With simplifyVector = FALSE each chunk is parsed as a list, so measure its
  # text field rather than the list itself.
  # NOTE(review): assumes the chunk text is stored under `content`, matching
  # the top-level result's `content` field — confirm against the JSON schema.
  cat(sprintf("Chunk %d: %d characters\n", i, nchar(result$chunks[[i]]$content)))
}
All Configuration Categories
- ExtractionConfig — top-level options
- OcrConfig — OCR backend, language, acceleration
- TesseractConfig — Tesseract PSM, confidence, table detection
- ChunkingConfig — chunk size, overlap
- TokenReductionConfig — LLM prompt token reduction
- ContentFilterConfig — header/footer/watermark filtering
- PageConfig — page tracking and markers
- AccelerationConfig — ONNX Runtime execution provider
Next Steps
- Extraction Basics — core extraction API and supported formats
- OCR Guide — backend installation and language setup
- Advanced Features — embeddings, language detection, page tracking
- Plugins Guide — custom post-processors and validators