Advanced Features¶
Text Chunking¶
Split extracted text into chunks for RAG systems, vector databases, or LLM context windows. Two strategies: Text (splits on whitespace/punctuation boundaries) and Markdown (structure-aware, preserves headings, lists, code blocks).
Configuration¶
import asyncio
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
async def main() -> None:
config: ExtractionConfig = ExtractionConfig(
chunking=ChunkingConfig(
max_chars=1000,
max_overlap=200,
)
)
result = await extract_file("document.pdf", config=config)
print(f"Chunks: {len(result.chunks or [])}")
for chunk in result.chunks or []:
print(f"Length: {len(chunk.content)}")
asyncio.run(main())
import asyncio
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
async def main() -> None:
config: ExtractionConfig = ExtractionConfig(
chunking=ChunkingConfig(
chunker_type="markdown",
max_chars=500,
max_overlap=50,
sizing_type="tokenizer",
sizing_model="Xenova/gpt-4o",
)
)
result = await extract_file("document.md", config=config)
for chunk in result.chunks or []:
heading_context = chunk.metadata.get("heading_context")
if heading_context:
headings = heading_context.get("headings", [])
for h in headings:
print(f"Heading L{h['level']}: {h['text']}")
print(f"Content: {chunk.content[:100]}...")
asyncio.run(main())
import asyncio
from kreuzberg import ExtractionConfig, ChunkingConfig, extract_file
async def main() -> None:
config: ExtractionConfig = ExtractionConfig(
chunking=ChunkingConfig(
chunker_type="markdown",
max_chars=500,
max_overlap=50,
prepend_heading_context=True,
)
)
result = await extract_file("document.md", config=config)
for chunk in result.chunks or []:
# Each chunk's content is prefixed with its heading breadcrumb
print(f"Content: {chunk.content[:100]}...")
asyncio.run(main())
import { extractFile } from '@kreuzberg/node';
const config = {
chunking: {
maxChars: 1000,
maxOverlap: 200,
},
};
const result = await extractFile('document.pdf', null, config);
console.log(`Total chunks: ${result.chunks?.length ?? 0}`);
import { extractFile } from '@kreuzberg/node';
const config = {
chunking: {
chunkerType: 'markdown',
maxChars: 500,
maxOverlap: 50,
sizingType: 'tokenizer',
sizingModel: 'Xenova/gpt-4o',
},
};
const result = await extractFile('document.md', null, config);
for (const chunk of result.chunks ?? []) {
const headings = chunk.metadata?.headingContext?.headings ?? [];
for (const heading of headings) {
console.log(`Heading L${heading.level}: ${heading.text}`);
}
console.log(`Content: ${chunk.content.slice(0, 100)}...`);
}
import { extractFile } from '@kreuzberg/node';
const config = {
chunking: {
chunkerType: 'markdown',
maxChars: 500,
maxOverlap: 50,
prependHeadingContext: true,
},
};
const result = await extractFile('document.md', null, config);
for (const chunk of result.chunks ?? []) {
// Each chunk's content is prefixed with its heading breadcrumb
console.log(`Content: ${chunk.content.slice(0, 100)}...`);
}
use kreuzberg::{ExtractionConfig, ChunkingConfig};
let config = ExtractionConfig {
chunking: Some(ChunkingConfig {
max_characters: 1000,
overlap: 200,
embedding: None,
}),
..Default::default()
};
use kreuzberg::{ExtractionConfig, ChunkingConfig, ChunkerType};
let config = ExtractionConfig {
chunking: Some(ChunkingConfig {
max_characters: 500,
overlap: 50,
chunker_type: ChunkerType::Markdown,
prepend_heading_context: true,
..Default::default()
}),
..Default::default()
};
package main
import (
"fmt"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
maxChars := 1000
maxOverlap := 200
config := &kreuzberg.ExtractionConfig{
Chunking: &kreuzberg.ChunkingConfig{
MaxChars: &maxChars,
MaxOverlap: &maxOverlap,
},
}
fmt.Printf("Config: MaxChars=%d, MaxOverlap=%d\n", *config.Chunking.MaxChars, *config.Chunking.MaxOverlap)
}
package main
import (
"fmt"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
maxChars := 500
maxOverlap := 50
config := &kreuzberg.ExtractionConfig{
Chunking: &kreuzberg.ChunkingConfig{
MaxChars: &maxChars,
MaxOverlap: &maxOverlap,
Sizing: &kreuzberg.ChunkSizingConfig{
Type: "tokenizer",
Model: "Xenova/gpt-4o",
},
},
}
result, err := kreuzberg.ExtractFile("document.md", nil, config)
if err != nil {
panic(err)
}
for _, chunk := range result.Chunks {
if chunk.Metadata != nil && chunk.Metadata.HeadingContext != nil {
for _, heading := range chunk.Metadata.HeadingContext.Headings {
fmt.Printf("Heading L%d: %s\n", heading.Level, heading.Text)
}
}
fmt.Printf("Content: %.100s...\n", chunk.Content)
}
}
package main
import (
"fmt"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func boolPtr(b bool) *bool { return &b }
func main() {
maxChars := 500
maxOverlap := 50
config := &kreuzberg.ExtractionConfig{
Chunking: &kreuzberg.ChunkingConfig{
MaxChars: &maxChars,
MaxOverlap: &maxOverlap,
PrependHeadingContext: boolPtr(true),
},
}
result, err := kreuzberg.ExtractFile("document.md", nil, config)
if err != nil {
panic(err)
}
for _, chunk := range result.Chunks {
// Each chunk's content is prefixed with its heading breadcrumb
fmt.Printf("Content: %.100s...\n", chunk.Content)
}
}
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.ChunkingConfig;
ExtractionConfig config = ExtractionConfig.builder()
.chunking(ChunkingConfig.builder()
.maxChars(1000)
.maxOverlap(200)
.build())
.build();
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.ChunkingConfig;
import dev.kreuzberg.HeadingContext;
import dev.kreuzberg.HeadingLevel;
ExtractionConfig config = ExtractionConfig.builder()
.chunking(ChunkingConfig.builder()
.chunkerType("markdown")
.maxChars(500)
.maxOverlap(50)
.sizingTokenizer("Xenova/gpt-4o")
.build())
.build();
ExtractionResult result = Kreuzberg.extractFile("document.md", config);
result.getChunks().forEach(chunk -> {
var headingContext = chunk.getMetadata().getHeadingContext();
if (headingContext.isPresent()) {
System.out.println("Headings:");
headingContext.get().getHeadings().forEach(heading ->
System.out.println(" Level " + heading.getLevel() + ": " + heading.getText())
);
}
});
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.ChunkingConfig;
ExtractionConfig config = ExtractionConfig.builder()
.chunking(ChunkingConfig.builder()
.chunkerType("markdown")
.maxChars(500)
.maxOverlap(50)
.prependHeadingContext(true)
.build())
.build();
ExtractionResult result = Kreuzberg.extractFile("document.md", config);
result.getChunks().forEach(chunk -> {
// Each chunk's content is prefixed with its heading breadcrumb
System.out.println(chunk.getContent().substring(0, Math.min(100, chunk.getContent().length())));
});
using Kreuzberg;
class Program
{
static async Task Main()
{
var config = new ExtractionConfig
{
Chunking = new ChunkingConfig
{
MaxChars = 1000,
MaxOverlap = 200,
Embedding = new EmbeddingConfig
{
Model = EmbeddingModelType.Preset("all-minilm-l6-v2"),
Normalize = true,
BatchSize = 32
}
}
};
try
{
var result = await KreuzbergClient.ExtractFileAsync(
"document.pdf",
config
).ConfigureAwait(false);
Console.WriteLine($"Chunks: {result.Chunks.Count}");
foreach (var chunk in result.Chunks)
{
Console.WriteLine($"Content length: {chunk.Content.Length}");
if (chunk.Embedding != null)
{
Console.WriteLine($"Embedding dimensions: {chunk.Embedding.Length}");
}
}
}
catch (KreuzbergException ex)
{
Console.WriteLine($"Error: {ex.Message}");
}
}
}
using Kreuzberg;
class Program
{
static async Task Main()
{
var config = new ExtractionConfig
{
Chunking = new ChunkingConfig
{
MaxChars = 500,
MaxOverlap = 50,
Sizing = new ChunkSizingConfig
{
Type = "tokenizer",
Model = "Xenova/gpt-4o"
}
}
};
try
{
var result = await KreuzbergClient.ExtractFileAsync(
"document.md",
config
).ConfigureAwait(false);
foreach (var chunk in result.Chunks)
{
if (chunk.HeadingContext?.Headings != null)
{
Console.WriteLine("Headings:");
foreach (var heading in chunk.HeadingContext.Headings)
{
Console.WriteLine($" Level {heading.Level}: {heading.Text}");
}
}
}
}
catch (KreuzbergException ex)
{
Console.WriteLine($"Error: {ex.Message}");
}
}
}
using Kreuzberg;
class Program
{
static async Task Main()
{
var config = new ExtractionConfig
{
Chunking = new ChunkingConfig
{
MaxChars = 500,
MaxOverlap = 50,
PrependHeadingContext = true
}
};
try
{
var result = await KreuzbergClient.ExtractFileAsync(
"document.md",
config
).ConfigureAwait(false);
foreach (var chunk in result.Chunks)
{
// Each chunk's content is prefixed with its heading breadcrumb
Console.WriteLine(chunk.Content[..Math.Min(100, chunk.Content.Length)]);
}
}
catch (KreuzbergException ex)
{
Console.WriteLine($"Error: {ex.Message}");
}
}
}
require 'kreuzberg'
config = Kreuzberg::Config::Extraction.new(
chunking: Kreuzberg::Config::Chunking.new(
max_characters: 1000,
overlap: 200
)
)
require 'kreuzberg'
config = Kreuzberg::Config::Extraction.new(
chunking: Kreuzberg::Config::Chunking.new(
chunker_type: "markdown",
max_characters: 500,
overlap: 50,
sizing_type: "tokenizer",
sizing_model: "Xenova/gpt-4o"
)
)
result = Kreuzberg.extract_file_sync("document.md", config: config)
result.chunks.each do |chunk|
if chunk.metadata.heading_context
puts "Headings:"
chunk.metadata.heading_context.headings.each do |heading|
puts " #{' ' * (heading.level - 1) * 2}Level #{heading.level}: #{heading.text}"
end
end
end
require 'kreuzberg'
config = Kreuzberg::Config::Extraction.new(
chunking: Kreuzberg::Config::Chunking.new(
chunker_type: "markdown",
max_characters: 500,
overlap: 50,
prepend_heading_context: true
)
)
result = Kreuzberg.extract_file_sync("document.md", config: config)
result.chunks.each do |chunk|
# Each chunk's content is prefixed with its heading breadcrumb
puts chunk.content[0, 100]
end
library(kreuzberg)
# Example 1: Basic character-based chunking
chunking_cfg <- chunking_config(max_characters = 1000L, overlap = 200L)
config <- extraction_config(chunking = chunking_cfg)
result <- extract_file_sync("document.pdf", "application/pdf", config)
num_chunks <- length(result$chunks)
cat(sprintf("Document split into %d chunks\n", num_chunks))
for (i in seq_len(min(3L, num_chunks))) {
cat(sprintf("Chunk %d: %d characters\n", i, nchar(result$chunks[[i]])))
}
# Example 2: Markdown chunker with token-based sizing and heading context
chunking_cfg2 <- chunking_config(
chunker_type = "markdown",
sizing = list(
type = "tokenizer",
model = "Xenova/gpt-4o"
)
)
config2 <- extraction_config(chunking = chunking_cfg2)
result2 <- extract_file_sync("document.md", "text/markdown", config2)
num_chunks2 <- length(result2$chunks)
cat(sprintf("\nMarkdown document split into %d chunks\n", num_chunks2))
for (i in seq_len(min(3L, num_chunks2))) {
chunk <- result2$chunks[[i]]
cat(sprintf("\nChunk %d:\n", i))
cat(sprintf(" Preview: %s...\n", substr(chunk$text, 1, 60)))
# Access heading context
if (!is.null(chunk$metadata$heading_context)) {
headings <- chunk$metadata$heading_context$headings
if (length(headings) > 0) {
cat(" Headings in context:\n")
for (h in headings) {
cat(sprintf(" - Level %d: %s\n", h$level, h$text))
}
}
}
}
# Example 3: Prepend heading context to chunk content
chunking_cfg3 <- chunking_config(
chunker_type = "markdown",
prepend_heading_context = TRUE
)
config3 <- extraction_config(chunking = chunking_cfg3)
result3 <- extract_file_sync("document.md", "text/markdown", config3)
num_chunks3 <- length(result3$chunks)
cat(sprintf("\nDocument split into %d chunks with prepended headings\n", num_chunks3))
for (i in seq_len(min(3L, num_chunks3))) {
chunk <- result3$chunks[[i]]
# Each chunk's content is prefixed with its heading breadcrumb
cat(sprintf("Chunk %d: %s...\n", i, substr(chunk$content, 1, 80)))
}
import { initWasm, extractBytes } from '@kreuzberg/wasm';
await initWasm();
const config = {
chunking: {
maxChars: 1000,
maxOverlap: 100
}
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, 'application/pdf', config);
result.chunks?.forEach((chunk, idx) => {
console.log(`Chunk ${idx}: ${chunk.content.substring(0, 50)}...`);
console.log(`Tokens: ${chunk.metadata?.tokenCount}`);
});
import { initWasm, extractBytes } from '@kreuzberg/wasm';
await initWasm();
const config = {
chunking: {
chunkerType: 'markdown',
maxChars: 2000
// Note: Token-based sizing is not available in WASM builds.
// Use character-based sizing instead.
}
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, 'text/markdown', config);
result.chunks?.forEach((chunk, idx) => {
console.log(`Chunk ${idx}: ${chunk.content.substring(0, 50)}...`);
if (chunk.metadata?.headingContext?.headings) {
console.log('Headings:');
chunk.metadata.headingContext.headings.forEach(h => {
console.log(` Level ${h.level}: ${h.text}`);
});
}
});
import { initWasm, extractBytes } from '@kreuzberg/wasm';
await initWasm();
const config = {
chunking: {
chunkerType: 'markdown',
maxChars: 2000,
prependHeadingContext: true,
}
};
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, 'text/markdown', config);
result.chunks?.forEach((chunk, idx) => {
// Each chunk's content is prefixed with its heading breadcrumb
console.log(`Chunk ${idx}: ${chunk.content.substring(0, 80)}...`);
});
Chunk Output¶
Each chunk in result.chunks contains:
| Field | Description |
|---|---|
| `content` | Chunk text |
| `metadata.byte_start` / `byte_end` | Byte offsets in the original text |
| `metadata.chunk_index` / `total_chunks` | Position in sequence |
| `metadata.token_count` | Token count (when embeddings enabled) |
| `metadata.heading_context` | Active heading hierarchy (Markdown chunker only) |
| `embedding` | Embedding vector (when configured) |
Chunks can be sized by token count instead of characters: enable the `chunking-tokenizers` feature and set the sizing type to `tokenizer`, as in the Markdown chunking examples above.
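For orientation, here is a minimal sketch that reads these fields with the async Python API used throughout this page (the metadata keys are assumed to match the snake_case names in the table; document.pdf is a placeholder):

import asyncio
from kreuzberg import ChunkingConfig, ExtractionConfig, extract_file

async def main() -> None:
    config = ExtractionConfig(chunking=ChunkingConfig(max_chars=1000, max_overlap=200))
    result = await extract_file("document.pdf", config=config)
    for chunk in result.chunks or []:
        meta = chunk.metadata
        # Position and size of each chunk within the source text
        print(
            f"chunk {meta.get('chunk_index')}/{meta.get('total_chunks')}: "
            f"bytes {meta.get('byte_start')}-{meta.get('byte_end')}, "
            f"{len(chunk.content)} chars"
        )

asyncio.run(main())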
RAG Pipeline Example¶
import asyncio
from kreuzberg import (
extract_file,
ExtractionConfig,
ChunkingConfig,
EmbeddingConfig,
EmbeddingModelType,
)
async def main() -> None:
config: ExtractionConfig = ExtractionConfig(
chunking=ChunkingConfig(
max_chars=500,
max_overlap=50,
embedding=EmbeddingConfig(
model=EmbeddingModelType.preset("balanced"),
normalize=True,
batch_size=16
)
)
)
result = await extract_file("research_paper.pdf", config=config)
chunks_with_embeddings: list = []
for chunk in result.chunks or []:
if chunk.embedding:
chunks_with_embeddings.append({
"content": chunk.content[:100],
"embedding_dims": len(chunk.embedding)
})
print(f"Chunks with embeddings: {len(chunks_with_embeddings)}")
asyncio.run(main())
import { extractFile } from '@kreuzberg/node';
const config = {
chunking: {
maxChars: 500,
maxOverlap: 50,
embedding: {
preset: 'balanced',
},
},
};
const result = await extractFile('research_paper.pdf', null, config);
if (result.chunks) {
for (const chunk of result.chunks) {
console.log(`Chunk ${chunk.metadata.chunkIndex + 1}/${chunk.metadata.totalChunks}`);
console.log(`Position: ${chunk.metadata.byteStart}-${chunk.metadata.byteEnd}`);
console.log(`Content: ${chunk.content.slice(0, 100)}...`);
if (chunk.embedding) {
console.log(`Embedding: ${chunk.embedding.length} dimensions`);
}
}
}
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, EmbeddingConfig};
let config = ExtractionConfig {
chunking: Some(ChunkingConfig {
max_characters: 500,
overlap: 50,
embedding: Some(EmbeddingConfig {
model: "balanced".to_string(),
normalize: true,
..Default::default()
}),
..Default::default()
}),
..Default::default()
};
let result = extract_file("research_paper.pdf", None, &config).await?;
if let Some(chunks) = result.chunks {
for chunk in chunks {
println!("Chunk {}/{}",
chunk.metadata.chunk_index + 1,
chunk.metadata.total_chunks
);
println!("Position: {}-{}",
chunk.metadata.byte_start,
chunk.metadata.byte_end
);
println!("Content: {}...", &chunk.content[..100.min(chunk.content.len())]);
if let Some(embedding) = chunk.embedding {
println!("Embedding: {} dimensions", embedding.len());
}
}
}
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
    maxChars := 500
    maxOverlap := 50
    normalize := true
    batchSize := int32(16)
    config := &kreuzberg.ExtractionConfig{
        Chunking: &kreuzberg.ChunkingConfig{
            MaxChars:   &maxChars,
            MaxOverlap: &maxOverlap,
            Embedding: &kreuzberg.EmbeddingConfig{
                Model:     kreuzberg.EmbeddingModelType_Preset("all-mpnet-base-v2"),
                Normalize: &normalize,
                BatchSize: &batchSize,
            },
        },
    }
    result, err := kreuzberg.ExtractFileSync("research_paper.pdf", config)
    if err != nil {
        log.Fatalf("RAG extraction failed: %v", err)
    }
    chunks := result.Chunks
    fmt.Printf("Found %d chunks for RAG pipeline\n", len(chunks))
    for i := 0; i < len(chunks) && i < 3; i++ {
        chunk := chunks[i]
        content := chunk.Content
        if len(content) > 80 {
            content = content[:80]
        }
        fmt.Printf("Chunk %d: %s...\n", i, content)
    }
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.ChunkingConfig;
import dev.kreuzberg.config.EmbeddingConfig;
import dev.kreuzberg.config.EmbeddingModelType;
import java.util.List;
ExtractionConfig config = ExtractionConfig.builder()
.chunking(ChunkingConfig.builder()
.maxChars(500)
.maxOverlap(50)
.embedding(EmbeddingConfig.builder()
.model(EmbeddingModelType.preset("all-mpnet-base-v2"))
.normalize(true)
.batchSize(16)
.build())
.build())
.build();
try {
ExtractionResult result = Kreuzberg.extractFile("research_paper.pdf", config);
List<Object> chunks = result.getChunks() != null ? result.getChunks() : List.of();
System.out.println("Found " + chunks.size() + " chunks for RAG pipeline");
for (int i = 0; i < Math.min(3, chunks.size()); i++) {
Object chunk = chunks.get(i);
System.out.println("Chunk " + i + ": " + chunk.toString().substring(0, Math.min(80, chunk.toString().length())) + "...");
}
} catch (Exception ex) {
System.err.println("RAG extraction failed: " + ex.getMessage());
}
using Kreuzberg;
using System.Collections.Generic;
using System.Linq;
class RagPipelineExample
{
static async Task Main()
{
var config = new ExtractionConfig
{
Chunking = new ChunkingConfig
{
MaxChars = 500,
MaxOverlap = 50,
Embedding = new EmbeddingConfig
{
Model = EmbeddingModelType.Preset("all-mpnet-base-v2"),
Normalize = true,
BatchSize = 16
}
}
};
try
{
var result = await KreuzbergClient.ExtractFileAsync(
"research_paper.pdf",
config
).ConfigureAwait(false);
var vectorStore = await BuildVectorStoreAsync(result.Chunks)
.ConfigureAwait(false);
var query = "machine learning optimization";
var relevantChunks = await SearchAsync(vectorStore, query)
.ConfigureAwait(false);
Console.WriteLine($"Found {relevantChunks.Count} relevant chunks");
foreach (var chunk in relevantChunks.Take(3))
{
Console.WriteLine($"Content: {chunk.Content[..80]}...");
Console.WriteLine($"Similarity: {chunk.Similarity:F3}\n");
}
}
catch (KreuzbergException ex)
{
Console.WriteLine($"Error: {ex.Message}");
}
}
static async Task<List<VectorEntry>> BuildVectorStoreAsync(
IEnumerable<Chunk> chunks)
{
return await Task.Run(() =>
{
return chunks.Select(c => new VectorEntry
{
Content = c.Content,
Embedding = c.Embedding?.ToArray() ?? Array.Empty<float>(),
Similarity = 0f
}).ToList();
}).ConfigureAwait(false);
}
static async Task<List<VectorEntry>> SearchAsync(
List<VectorEntry> store,
string query)
{
return await Task.Run(() =>
{
return store
.OrderByDescending(e => e.Similarity)
.ToList();
}).ConfigureAwait(false);
}
class VectorEntry
{
public string Content { get; set; } = string.Empty;
public float[] Embedding { get; set; } = Array.Empty<float>();
public float Similarity { get; set; }
}
}
require 'kreuzberg'
config = Kreuzberg::Config::Extraction.new(
chunking: Kreuzberg::Config::Chunking.new(
max_characters: 500,
overlap: 50,
embedding: Kreuzberg::Config::Embedding.new(
model: Kreuzberg::EmbeddingModelType.new(
type: 'preset',
name: 'all-mpnet-base-v2'
),
normalize: true,
batch_size: 16
)
)
)
def build_vector_store(chunks)
  chunks.map.with_index do |chunk, idx|
    {
      id: idx,
      content: chunk.content,
      embedding: chunk.embedding,
      similarity: 0.0
    }
  end
end

def search_vector_store(store, query)
  store.sort_by { |entry| entry[:similarity] }.reverse
end

result = Kreuzberg.extract_file_sync('research_paper.pdf', config: config)
vector_store = build_vector_store(result.chunks)
query = 'machine learning optimization'
relevant_chunks = search_vector_store(vector_store, query)
puts "Found #{relevant_chunks.length} relevant chunks"
relevant_chunks.take(3).each do |chunk|
  puts "Content: #{chunk[:content][0..80]}..."
  puts "Similarity: #{chunk[:similarity]&.round(3)}\n"
end
library(kreuzberg)
chunking_cfg <- chunking_config(max_characters = 800L, overlap = 150L)
config <- extraction_config(chunking = chunking_cfg)
result <- extract_file_sync("document.pdf", "application/pdf", config)
cat(sprintf("Total chunks: %d\n", length(result$chunks)))
cat(sprintf("Processing chunks for RAG pipeline:\n"))
for (i in seq_len(min(3L, length(result$chunks)))) {
chunk <- result$chunks[[i]]
cat(sprintf("Chunk %d: %d characters\n", i, nchar(chunk)))
}
Language Detection¶
Detect languages using whatlang. Supports 60+ languages with ISO 639-3 codes. Set detect_multiple: true to detect all languages in a document (chunks text into 200-char segments, returns languages sorted by frequency).
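Conceptually, multi-language detection behaves like the following sketch (illustrative only, not the library's internals; detect_language stands in for the underlying whatlang call):

def detect_multiple_sketch(text: str, detect_language) -> list[str]:
    # Split into 200-char segments, detect each, rank languages by frequency.
    segments = [text[i:i + 200] for i in range(0, len(text), 200)]
    counts: dict[str, int] = {}
    for segment in segments:
        lang = detect_language(segment)  # ISO 639-3 code, e.g. "eng"
        if lang:
            counts[lang] = counts.get(lang, 0) + 1
    return sorted(counts, key=counts.get, reverse=True)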
Configuration¶
import asyncio
from kreuzberg import ExtractionConfig, LanguageDetectionConfig, extract_file
async def main() -> None:
config: ExtractionConfig = ExtractionConfig(
language_detection=LanguageDetectionConfig(
enabled=True,
min_confidence=0.85,
detect_multiple=False
)
)
result = await extract_file("document.pdf", config=config)
if result.detected_languages:
print(f"Primary language: {result.detected_languages[0]}")
print(f"Content length: {len(result.content)} chars")
asyncio.run(main())
import { extractFile } from '@kreuzberg/node';
const config = {
languageDetection: {
enabled: true,
minConfidence: 0.8,
detectMultiple: false,
},
};
const result = await extractFile('document.pdf', null, config);
if (result.detectedLanguages) {
console.log(`Detected languages: ${result.detectedLanguages.join(', ')}`);
}
package main
import (
"fmt"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
minConfidence := 0.8
config := &kreuzberg.ExtractionConfig{
LanguageDetection: &kreuzberg.LanguageDetectionConfig{
Enabled: true,
MinConfidence: &minConfidence,
DetectMultiple: false,
},
}
fmt.Printf("Language detection enabled: %v\n", config.LanguageDetection.Enabled)
fmt.Printf("Min confidence: %f\n", *config.LanguageDetection.MinConfidence)
}
using Kreuzberg;
class Program
{
static async Task Main()
{
var config = new ExtractionConfig
{
LanguageDetection = new LanguageDetectionConfig
{
Enabled = true,
MinConfidence = 0.8m,
DetectMultiple = false
}
};
try
{
var result = await KreuzbergClient.ExtractFileAsync("document.pdf", config);
if (result.DetectedLanguages?.Count > 0)
{
Console.WriteLine($"Detected Language: {result.DetectedLanguages[0]}");
}
else
{
Console.WriteLine("No language detected");
}
Console.WriteLine($"Content length: {result.Content.Length} characters");
}
catch (KreuzbergException ex)
{
Console.WriteLine($"Extraction failed: {ex.Message}");
}
}
}
Multilingual Example¶
import asyncio
from kreuzberg import extract_file, ExtractionConfig, LanguageDetectionConfig
async def main() -> None:
config: ExtractionConfig = ExtractionConfig(
language_detection=LanguageDetectionConfig(
enabled=True,
min_confidence=0.7,
detect_multiple=True
)
)
result = await extract_file("multilingual_document.pdf", config=config)
languages: list[str] = result.detected_languages or []
print(f"Detected {len(languages)} languages: {languages}")
asyncio.run(main())
import { extractFile } from '@kreuzberg/node';
const config = {
languageDetection: {
enabled: true,
minConfidence: 0.8,
detectMultiple: true,
},
};
const result = await extractFile('multilingual_document.pdf', null, config);
if (result.detectedLanguages) {
console.log(`Detected languages: ${result.detectedLanguages.join(', ')}`);
}
use kreuzberg::{extract_file, ExtractionConfig, LanguageDetectionConfig};
let config = ExtractionConfig {
language_detection: Some(LanguageDetectionConfig {
enabled: true,
min_confidence: 0.8,
detect_multiple: true,
}),
..Default::default()
};
let result = extract_file("multilingual_document.pdf", None, &config).await?;
println!("Detected languages: {:?}", result.detected_languages);
package main
import (
"fmt"
"log"
"strings"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
    enabled := true
    detectMultiple := true
    minConfidence := 0.8
    config := &kreuzberg.ExtractionConfig{
        LanguageDetection: &kreuzberg.LanguageDetectionConfig{
            Enabled:        &enabled,
            MinConfidence:  &minConfidence,
            DetectMultiple: &detectMultiple,
        },
    }
    result, err := kreuzberg.ExtractFileSync("multilingual_document.pdf", config)
    if err != nil {
        log.Fatalf("Processing failed: %v", err)
    }
    languages := result.DetectedLanguages
    if len(languages) > 0 {
        fmt.Printf("Detected %d language(s): %s\n", len(languages), strings.Join(languages, ", "))
    } else {
        fmt.Println("No languages detected")
    }
    fmt.Printf("Total content: %d characters\n", len(result.Content))
    fmt.Printf("MIME type: %s\n", result.MimeType)
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.LanguageDetectionConfig;
import java.math.BigDecimal;
import java.util.List;
ExtractionConfig config = ExtractionConfig.builder()
.languageDetection(LanguageDetectionConfig.builder()
.enabled(true)
.minConfidence(new BigDecimal("0.8"))
.detectMultiple(true)
.build())
.build();
try {
ExtractionResult result = Kreuzberg.extractFile("multilingual_document.pdf", config);
List<String> languages = result.getDetectedLanguages() != null
? result.getDetectedLanguages()
: List.of();
if (!languages.isEmpty()) {
System.out.println("Detected " + languages.size() + " language(s): " + String.join(", ", languages));
} else {
System.out.println("No languages detected");
}
System.out.println("Total content: " + result.getContent().length() + " characters");
System.out.println("MIME type: " + result.getMimeType());
} catch (Exception ex) {
System.err.println("Processing failed: " + ex.getMessage());
}
using Kreuzberg;
class Program
{
static async Task Main()
{
var config = new ExtractionConfig
{
LanguageDetection = new LanguageDetectionConfig
{
Enabled = true,
MinConfidence = 0.8m,
DetectMultiple = true
}
};
try
{
var result = await KreuzbergClient.ExtractFileAsync("multilingual_document.pdf", config);
var languages = result.DetectedLanguages ?? new List<string>();
if (languages.Count > 0)
{
Console.WriteLine($"Detected {languages.Count} language(s): {string.Join(", ", languages)}");
}
else
{
Console.WriteLine("No languages detected");
}
Console.WriteLine($"Total content: {result.Content.Length} characters");
Console.WriteLine($"MIME type: {result.MimeType}");
}
catch (KreuzbergException ex)
{
Console.WriteLine($"Processing failed: {ex.Message}");
}
}
}
require 'kreuzberg'
config = Kreuzberg::Config::Extraction.new(
language_detection: Kreuzberg::Config::LanguageDetection.new(
enabled: true,
min_confidence: 0.8,
detect_multiple: true
)
)
result = Kreuzberg.extract_file_sync('multilingual_document.pdf', config: config)
languages = result.detected_languages || []
if languages.any?
puts "Detected #{languages.length} language(s): #{languages.join(', ')}"
else
puts "No languages detected"
end
puts "Total content: #{result.content.length} characters"
puts "MIME type: #{result.mime_type}"
library(kreuzberg)
files <- c("english.pdf", "spanish.pdf", "french.pdf")
config <- extraction_config(language_detection = list(enabled = TRUE))
for (file in files) {
result <- extract_file_sync(file, "application/pdf", config)
cat(sprintf("%s: detected language = %s\n",
file, result$detected_language))
}
Embedding Generation¶
Generate embeddings for semantic search and RAG using ONNX models. Requires the embeddings feature.
| Preset | Model | Dimensions | Max tokens |
|---|---|---|---|
| `fast` | AllMiniLML6V2Q | 384 | 512 |
| `balanced` | BGEBaseENV15 | 768 | 1024 |
| `quality` | BGELargeENV15 | 1024 | 2000 |
| `multilingual` | MultilingualE5Base | 768 | 1024 |
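With normalize=True the vectors are unit length, so cosine similarity reduces to a plain dot product. A dependency-free sketch (the embeddings are assumed to come from result.chunks):

def cosine_similarity(a: list[float], b: list[float]) -> float:
    # Equals the full cosine formula when both vectors are already normalized.
    return sum(x * y for x, y in zip(a, b))

# Usage sketch:
# chunks = result.chunks or []
# score = cosine_similarity(chunks[0].embedding, chunks[1].embedding)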
Configuration¶
from kreuzberg import (
ExtractionConfig,
ChunkingConfig,
EmbeddingConfig,
EmbeddingModelType,
)
config: ExtractionConfig = ExtractionConfig(
chunking=ChunkingConfig(
max_chars=1024,
max_overlap=100,
embedding=EmbeddingConfig(
model=EmbeddingModelType.preset("balanced"),
normalize=True,
batch_size=32,
show_download_progress=False,
),
)
)
use kreuzberg::{ExtractionConfig, ChunkingConfig, EmbeddingConfig};
let config = ExtractionConfig {
chunking: Some(ChunkingConfig {
max_characters: 1024,
overlap: 100,
embedding: Some(EmbeddingConfig {
model: "balanced".to_string(),
normalize: true,
batch_size: 32,
show_download_progress: false,
..Default::default()
}),
..Default::default()
}),
..Default::default()
};
package main
import (
"fmt"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
    maxChars := 512
    maxOverlap := 50
    normalize := true
    batchSize := int32(32)
    showProgress := false
    config := &kreuzberg.ExtractionConfig{
        Chunking: &kreuzberg.ChunkingConfig{
            MaxChars:   &maxChars,
            MaxOverlap: &maxOverlap,
            Embedding: &kreuzberg.EmbeddingConfig{
                Model:                kreuzberg.EmbeddingModelType_Preset("balanced"),
                Normalize:            &normalize,
                BatchSize:            &batchSize,
                ShowDownloadProgress: &showProgress,
            },
        },
    }
    result, err := kreuzberg.ExtractFileSync("document.pdf", config)
    if err != nil {
        fmt.Printf("Error: %v\n", err)
        return
    }
    for index, chunk := range result.Chunks {
        chunkID := fmt.Sprintf("doc_chunk_%d", index)
        content := chunk.Content
        if len(content) > 50 {
            content = content[:50]
        }
        fmt.Printf("Chunk %s: %s\n", chunkID, content)
        if chunk.Embedding != nil && len(chunk.Embedding) > 0 {
            fmt.Printf("  Embedding dimensions: %d\n", len(chunk.Embedding))
        }
    }
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.ChunkingConfig;
import dev.kreuzberg.config.EmbeddingConfig;
import dev.kreuzberg.config.EmbeddingModelType;
import java.util.List;
ExtractionConfig config = ExtractionConfig.builder()
.chunking(ChunkingConfig.builder()
.maxChars(512)
.maxOverlap(50)
.embedding(EmbeddingConfig.builder()
.model(EmbeddingModelType.preset("balanced"))
.normalize(true)
.batchSize(32)
.showDownloadProgress(false)
.build())
.build())
.build();
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
List<Object> chunks = result.getChunks() != null ? result.getChunks() : List.of();
for (int index = 0; index < chunks.size(); index++) {
Object chunk = chunks.get(index);
String chunkId = "doc_chunk_" + index;
System.out.println("Chunk " + chunkId + ": " + chunk.toString().substring(0, Math.min(50, chunk.toString().length())));
if (chunk instanceof java.util.Map) {
Object embedding = ((java.util.Map<String, Object>) chunk).get("embedding");
if (embedding != null) {
System.out.println(" Embedding dimensions: " + ((float[]) embedding).length);
}
}
}
using Kreuzberg;
using System;
using System.Collections.Generic;
using System.Threading.Tasks;
var config = new ExtractionConfig
{
Chunking = new ChunkingConfig
{
MaxChars = 512,
MaxOverlap = 50,
Embedding = new EmbeddingConfig
{
Model = EmbeddingModelType.Preset("balanced"),
Normalize = true,
BatchSize = 32,
ShowDownloadProgress = false
}
}
};
var result = await KreuzbergClient.ExtractFileAsync("document.pdf", config);
var chunks = result.Chunks ?? new List<Chunk>();
foreach (var (index, chunk) in chunks.WithIndex())
{
var chunkId = $"doc_chunk_{index}";
Console.WriteLine($"Chunk {chunkId}: {chunk.Content[..Math.Min(50, chunk.Content.Length)]}");
if (chunk.Embedding != null)
{
Console.WriteLine($" Embedding dimensions: {chunk.Embedding.Length}");
}
}
internal static class EnumerableExtensions
{
public static IEnumerable<(int Index, T Item)> WithIndex<T>(
this IEnumerable<T> items)
{
var index = 0;
foreach (var item in items)
{
yield return (index++, item);
}
}
}
require 'kreuzberg'
config = Kreuzberg::Config::Extraction.new(
chunking: Kreuzberg::Config::Chunking.new(
max_characters: 512,
overlap: 50,
embedding: Kreuzberg::Config::Embedding.new(
model: Kreuzberg::EmbeddingModelType.new(
type: 'preset',
name: 'balanced'
),
normalize: true,
batch_size: 32,
show_download_progress: false
)
)
)
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
chunks = result.chunks || []
chunks.each_with_index do |chunk, idx|
chunk_id = "doc_chunk_#{idx}"
puts "Chunk #{chunk_id}: #{chunk.content[0...50]}"
if chunk.embedding
puts " Embedding dimensions: #{chunk.embedding.length}"
end
end
library(kreuzberg)
chunking_cfg <- chunking_config(max_characters = 1000L, overlap = 200L)
config <- extraction_config(chunking = chunking_cfg)
result <- extract_file_sync("document.pdf", "application/pdf", config)
cat(sprintf("Preparing %d chunks for embedding:\n", length(result$chunks)))
embeddings_data <- list()
for (i in seq_len(length(result$chunks))) {
embeddings_data[[i]] <- list(
chunk_id = i,
text = result$chunks[[i]],
length = nchar(result$chunks[[i]])
)
}
cat(sprintf("Ready to embed %d chunks\n", length(embeddings_data)))
Vector Database Integration¶
import asyncio
from kreuzberg import (
extract_file,
ExtractionConfig,
ChunkingConfig,
EmbeddingConfig,
EmbeddingModelType,
)
async def main() -> None:
config: ExtractionConfig = ExtractionConfig(
chunking=ChunkingConfig(
max_chars=512,
max_overlap=50,
embedding=EmbeddingConfig(
model=EmbeddingModelType.preset("balanced"), normalize=True
),
)
)
result = await extract_file("document.pdf", config=config)
chunks = result.chunks or []
for i, chunk in enumerate(chunks):
chunk_id: str = f"doc_chunk_{i}"
print(f"Chunk {chunk_id}: {chunk.content[:50]}")
asyncio.run(main())
import { extractFile } from '@kreuzberg/node';
const config = {
chunking: {
maxChars: 512,
maxOverlap: 50,
embedding: {
preset: 'balanced',
},
},
};
const result = await extractFile('document.pdf', null, config);
if (result.chunks) {
for (const chunk of result.chunks) {
console.log(`Chunk: ${chunk.content.slice(0, 100)}...`);
if (chunk.embedding) {
console.log(`Embedding dims: ${chunk.embedding.length}`);
}
}
}
use kreuzberg::{extract_file, ExtractionConfig, ChunkingConfig, EmbeddingConfig};
struct VectorRecord {
id: String,
content: String,
embedding: Vec<f32>,
metadata: std::collections::HashMap<String, String>,
}
async fn extract_and_vectorize(
document_path: &str,
document_id: &str,
) -> Result<Vec<VectorRecord>, Box<dyn std::error::Error>> {
let config = ExtractionConfig {
chunking: Some(ChunkingConfig {
max_characters: 512,
overlap: 50,
embedding: Some(EmbeddingConfig {
model: kreuzberg::EmbeddingModelType::Preset {
name: "balanced".to_string(),
},
normalize: true,
batch_size: 32,
..Default::default()
}),
..Default::default()
}),
..Default::default()
};
let result = extract_file(document_path, None, &config).await?;
let mut records = Vec::new();
if let Some(chunks) = result.chunks {
for (index, chunk) in chunks.iter().enumerate() {
if let Some(embedding) = &chunk.embedding {
let mut metadata = std::collections::HashMap::new();
metadata.insert("document_id".to_string(), document_id.to_string());
metadata.insert("chunk_index".to_string(), index.to_string());
metadata.insert("content_length".to_string(), chunk.content.len().to_string());
records.push(VectorRecord {
id: format!("{}_chunk_{}", document_id, index),
content: chunk.content.clone(),
embedding: embedding.clone(),
metadata,
});
}
}
}
Ok(records)
}
package main
import (
"fmt"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
type VectorRecord struct {
ID string
Embedding []float32
Content string
Metadata map[string]string
}
func extractAndVectorize(documentPath string, documentID string) ([]VectorRecord, error) {
maxChars := 512
maxOverlap := 50
normalize := true
batchSize := int32(32)
config := &kreuzberg.ExtractionConfig{
Chunking: &kreuzberg.ChunkingConfig{
MaxChars: &maxChars,
MaxOverlap: &maxOverlap,
Embedding: &kreuzberg.EmbeddingConfig{
Model: kreuzberg.EmbeddingModelType_Preset("balanced"),
Normalize: &normalize,
BatchSize: &batchSize,
},
},
}
result, err := kreuzberg.ExtractFileSync(documentPath, config)
if err != nil {
return nil, err
}
var vectorRecords []VectorRecord
for index, chunk := range result.Chunks {
record := VectorRecord{
ID: fmt.Sprintf("%s_chunk_%d", documentID, index),
Content: chunk.Content,
Embedding: chunk.Embedding,
Metadata: map[string]string{
"document_id": documentID,
"chunk_index": fmt.Sprintf("%d", index),
"content_length": fmt.Sprintf("%d", len(chunk.Content)),
},
}
vectorRecords = append(vectorRecords, record)
}
storeInVectorDatabase(vectorRecords)
return vectorRecords, nil
}
func storeInVectorDatabase(records []VectorRecord) {
    for _, record := range records {
        if len(record.Embedding) > 0 {
            fmt.Printf("Storing %s: %d chars, %d dims\n",
                record.ID, len(record.Content), len(record.Embedding))
        }
    }
}

func main() {
    // Minimal driver so the snippet builds as package main; paths are placeholders.
    if _, err := extractAndVectorize("document.pdf", "doc_1"); err != nil {
        fmt.Printf("Error: %v\n", err)
    }
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.ChunkingConfig;
import dev.kreuzberg.config.EmbeddingConfig;
import dev.kreuzberg.config.EmbeddingModelType;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class VectorDatabaseIntegration {
public static class VectorRecord {
public String id;
public float[] embedding;
public String content;
public Map<String, String> metadata;
}
public static List<VectorRecord> extractAndVectorize(String documentPath, String documentId) throws Exception {
ExtractionConfig config = ExtractionConfig.builder()
.chunking(ChunkingConfig.builder()
.maxChars(512)
.maxOverlap(50)
.embedding(EmbeddingConfig.builder()
.model(EmbeddingModelType.preset("balanced"))
.normalize(true)
.batchSize(32)
.build())
.build())
.build();
ExtractionResult result = Kreuzberg.extractFile(documentPath, config);
List<Object> chunks = result.getChunks() != null ? result.getChunks() : List.of();
List<VectorRecord> vectorRecords = new java.util.ArrayList<>();
for (int index = 0; index < chunks.size(); index++) {
    Object chunk = chunks.get(index);
    VectorRecord record = new VectorRecord();
    record.id = documentId + "_chunk_" + index;
    record.metadata = new HashMap<>();
    record.metadata.put("document_id", documentId);
    record.metadata.put("chunk_index", String.valueOf(index));
    if (chunk instanceof java.util.Map) {
        Map<String, Object> chunkMap = (Map<String, Object>) chunk;
        record.content = (String) chunkMap.get("content");
        record.embedding = (float[]) chunkMap.get("embedding");
        record.metadata.put("content_length", String.valueOf(record.content.length()));
    }
vectorRecords.add(record);
}
storeInVectorDatabase(vectorRecords);
return vectorRecords;
}
private static void storeInVectorDatabase(List<VectorRecord> records) {
for (VectorRecord record : records) {
if (record.embedding != null && record.embedding.length > 0) {
System.out.println("Storing " + record.id + ": " + record.content.length()
+ " chars, " + record.embedding.length + " dims");
}
}
}
}
using Kreuzberg;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
public class VectorDatabaseIntegration
{
public class VectorRecord
{
public string Id { get; set; }
public float[] Embedding { get; set; }
public string Content { get; set; }
public Dictionary<string, string> Metadata { get; set; }
}
public async Task<List<VectorRecord>> ExtractAndVectorize(
string documentPath,
string documentId)
{
var config = new ExtractionConfig
{
Chunking = new ChunkingConfig
{
MaxChars = 512,
MaxOverlap = 50,
Embedding = new EmbeddingConfig
{
Model = EmbeddingModelType.Preset("balanced"),
Normalize = true,
BatchSize = 32
}
}
};
var result = await KreuzbergClient.ExtractFileAsync(documentPath, config);
var chunks = result.Chunks ?? new List<Chunk>();
var vectorRecords = chunks
.Select((chunk, index) => new VectorRecord
{
Id = $"{documentId}_chunk_{index}",
Content = chunk.Content,
Embedding = chunk.Embedding,
Metadata = new Dictionary<string, string>
{
{ "document_id", documentId },
{ "chunk_index", index.ToString() },
{ "content_length", chunk.Content.Length.ToString() }
}
})
.ToList();
await StoreInVectorDatabase(vectorRecords);
return vectorRecords;
}
private async Task StoreInVectorDatabase(List<VectorRecord> records)
{
foreach (var record in records)
{
if (record.Embedding != null && record.Embedding.Length > 0)
{
Console.WriteLine(
$"Storing {record.Id}: {record.Content.Length} chars, " +
$"{record.Embedding.Length} dims");
}
}
await Task.CompletedTask;
}
}
require 'kreuzberg'
class VectorDatabaseIntegration
VectorRecord = Struct.new(:id, :embedding, :content, :metadata, keyword_init: true)
def extract_and_vectorize(document_path, document_id)
config = Kreuzberg::Config::Extraction.new(
chunking: Kreuzberg::Config::Chunking.new(
max_characters: 512,
overlap: 50,
embedding: Kreuzberg::Config::Embedding.new(
model: Kreuzberg::EmbeddingModelType.new(
type: 'preset',
name: 'balanced'
),
normalize: true,
batch_size: 32
)
)
)
result = Kreuzberg.extract_file_sync(document_path, config: config)
chunks = result.chunks || []
vector_records = chunks.map.with_index do |chunk, idx|
VectorRecord.new(
id: "#{document_id}_chunk_#{idx}",
content: chunk.content,
embedding: chunk.embedding,
metadata: {
document_id: document_id,
chunk_index: idx,
content_length: chunk.content.length
}
)
end
store_in_vector_database(vector_records)
vector_records
end
private
def store_in_vector_database(records)
records.each do |record|
if record.embedding&.any?
puts "Storing #{record.id}: #{record.content.length} chars, #{record.embedding.length} dims"
end
end
end
end
library(kreuzberg)
chunking_cfg <- chunking_config(max_characters = 1000L, overlap = 200L)
config <- extraction_config(chunking = chunking_cfg)
result <- extract_file_sync("document.pdf", "application/pdf", config)
for (i in seq_len(min(3L, length(result$chunks)))) {
chunk <- result$chunks[[i]]
vector_doc <- list(
id = sprintf("doc_%d", i),
text = chunk,
metadata = list(
source = "document.pdf",
chunk_index = i,
length = nchar(chunk)
)
)
cat(sprintf("Vector DB entry %d: %d chars\n", i, nchar(chunk)))
}
Token Reduction¶
Reduce token count while preserving meaning for LLM pipelines.
| Level | Reduction | Effect |
|---|---|---|
| `off` | 0% | Pass-through |
| `moderate` | 15–25% | Stopwords + redundancy removal |
| `aggressive` | 30–50% | Semantic clustering + importance scoring |
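The reduction ratio reported in the result metadata relates the original and reduced token counts; a quick sanity check with made-up numbers:

# Hypothetical counts, for illustration only.
original_tokens, reduced_tokens = 1000, 780
ratio = 1 - reduced_tokens / original_tokens
print(f"Reduction: {ratio * 100:.1f}%")  # 22.0%, inside the moderate 15-25% band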
Configuration¶
use kreuzberg::{ExtractionConfig, TokenReductionConfig};
let config = ExtractionConfig {
token_reduction: Some(TokenReductionConfig {
mode: "moderate".to_string(),
preserve_markdown: true,
preserve_code: true,
language_hint: Some("eng".to_string()),
..Default::default()
}),
..Default::default()
};
package main
import (
"fmt"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
config := &kreuzberg.ExtractionConfig{
TokenReduction: &kreuzberg.TokenReductionConfig{
Mode: "moderate",
PreserveImportantWords: kreuzberg.BoolPtr(true),
},
}
fmt.Printf("Mode: %s, Preserve Important Words: %v\n",
config.TokenReduction.Mode,
*config.TokenReduction.PreserveImportantWords)
}
library(kreuzberg)
config <- extraction_config(
token_reduction = list(enabled = TRUE)
)
result <- extract_file_sync("document.pdf", "application/pdf", config)
cat(sprintf("Original content length: %d characters\n", nchar(result$content)))
cat(sprintf("Content preview: %.60s...\n", result$content))
Example¶
import asyncio
from kreuzberg import extract_file, ExtractionConfig, TokenReductionConfig
async def main() -> None:
config: ExtractionConfig = ExtractionConfig(
token_reduction=TokenReductionConfig(
mode="moderate", preserve_important_words=True
)
)
result = await extract_file("verbose_document.pdf", config=config)
original: int = result.metadata.get("original_token_count", 0)
reduced: int = result.metadata.get("token_count", 0)
ratio: float = result.metadata.get("token_reduction_ratio", 0.0)
print(f"Reduced from {original} to {reduced} tokens")
print(f"Reduction: {ratio * 100:.1f}%")
asyncio.run(main())
import { extractFile } from '@kreuzberg/node';
const config = {
tokenReduction: {
mode: 'moderate',
preserveImportantWords: true,
},
};
const result = await extractFile('verbose_document.pdf', null, config);
console.log(`Content length: ${result.content.length}`);
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
use kreuzberg::{extract_file, ExtractionConfig, TokenReductionConfig};
let config = ExtractionConfig {
token_reduction: Some(TokenReductionConfig {
mode: "moderate".to_string(),
preserve_markdown: true,
..Default::default()
}),
..Default::default()
};
let result = extract_file("verbose_document.pdf", None, &config).await?;
if let Some(original) = result.original_token_count {
println!("Original tokens: {}", original);
}
if let Some(reduced) = result.reduced_token_count {
println!("Reduced tokens: {}", reduced);
}
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
    preserveMarkdown := true
    mode := "moderate"
    config := &kreuzberg.ExtractionConfig{
        TokenReduction: &kreuzberg.TokenReductionConfig{
            Mode:             &mode,
            PreserveMarkdown: &preserveMarkdown,
        },
    }
    result, err := kreuzberg.ExtractFileSync("verbose_document.pdf", config)
    if err != nil {
        log.Fatalf("extraction failed: %v", err)
    }
    original := 0
    reduced := 0
    ratio := 0.0
    if val, ok := result.Metadata["original_token_count"]; ok {
        original = val.(int)
    }
    if val, ok := result.Metadata["token_count"]; ok {
        reduced = val.(int)
    }
    if val, ok := result.Metadata["token_reduction_ratio"]; ok {
        ratio = val.(float64)
    }
    fmt.Printf("Reduced from %d to %d tokens\n", original, reduced)
    fmt.Printf("Reduction: %.1f%%\n", ratio*100)
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.TokenReductionConfig;
import java.util.Map;
ExtractionConfig config = ExtractionConfig.builder()
.tokenReduction(TokenReductionConfig.builder()
.mode("moderate")
.preserveMarkdown(true)
.build())
.build();
ExtractionResult result = Kreuzberg.extractFile("verbose_document.pdf", config);
Map<String, Object> metadata = result.getMetadata() != null ? result.getMetadata() : Map.of();
int original = metadata.containsKey("original_token_count")
? ((Number) metadata.get("original_token_count")).intValue()
: 0;
int reduced = metadata.containsKey("token_count")
? ((Number) metadata.get("token_count")).intValue()
: 0;
double ratio = metadata.containsKey("token_reduction_ratio")
? ((Number) metadata.get("token_reduction_ratio")).doubleValue()
: 0.0;
System.out.println("Reduced from " + original + " to " + reduced + " tokens");
System.out.println(String.format("Reduction: %.1f%%", ratio * 100));
using Kreuzberg;
var config = new ExtractionConfig
{
TokenReduction = new TokenReductionConfig
{
Mode = "moderate",
PreserveMarkdown = true
}
};
var result = await KreuzbergClient.ExtractFileAsync(
"verbose_document.pdf",
config
);
var original = result.Metadata.ContainsKey("original_token_count")
? (int)result.Metadata["original_token_count"]
: 0;
var reduced = result.Metadata.ContainsKey("token_count")
? (int)result.Metadata["token_count"]
: 0;
var ratio = result.Metadata.ContainsKey("token_reduction_ratio")
? (double)result.Metadata["token_reduction_ratio"]
: 0.0;
Console.WriteLine($"Reduced from {original} to {reduced} tokens");
Console.WriteLine($"Reduction: {ratio * 100:F1}%");
require 'kreuzberg'
config = Kreuzberg::Config::Extraction.new(
token_reduction: Kreuzberg::Config::TokenReduction.new(
mode: 'moderate',
preserve_markdown: true
)
)
result = Kreuzberg.extract_file_sync('verbose_document.pdf', config: config)
original_tokens = result.metadata&.dig('original_token_count') || 0
reduced_tokens = result.metadata&.dig('token_count') || 0
reduction_ratio = result.metadata&.dig('token_reduction_ratio') || 0.0
puts "Reduced from #{original_tokens} to #{reduced_tokens} tokens"
puts "Reduction: #{(reduction_ratio * 100).round(1)}%"
library(kreuzberg)
config <- extraction_config(
token_reduction = list(enabled = TRUE)
)
result <- extract_file_sync("document.pdf", "application/pdf", config)
cat(sprintf("Token-reduced content:\n"))
cat(sprintf("Length: %d characters\n", nchar(result$content)))
cat(sprintf("Preview: %.60s...\n", result$content))
Keyword Extraction¶
Extract keywords using YAKE or RAKE algorithms. Requires the keywords feature flag. See Keyword Extraction for algorithm details and parameter reference.
Configuration¶
import asyncio
from kreuzberg import (
ExtractionConfig,
KeywordConfig,
KeywordAlgorithm,
extract_file,
)
async def main() -> None:
config: ExtractionConfig = ExtractionConfig(
keywords=KeywordConfig(
algorithm=KeywordAlgorithm.YAKE,
max_keywords=10,
min_score=0.3,
ngram_range=(1, 3),
language="en"
)
)
result = await extract_file("document.pdf", config=config)
print(f"Content extracted: {len(result.content)} chars")
asyncio.run(main())
use kreuzberg::{ExtractionConfig, KeywordConfig, KeywordAlgorithm};
let config = ExtractionConfig {
keywords: Some(KeywordConfig {
algorithm: KeywordAlgorithm::Yake,
max_keywords: 10,
min_score: 0.3,
ngram_range: (1, 3),
language: Some("en".to_string()),
..Default::default()
}),
..Default::default()
};
package main
import (
"fmt"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
config := &kreuzberg.ExtractionConfig{
Keywords: &kreuzberg.KeywordConfig{
Algorithm: "YAKE",
MaxKeywords: 10,
MinScore: 0.3,
NgramRange: "1,3",
Language: "en",
},
}
fmt.Printf("Keywords config: Algorithm=%s, MaxKeywords=%d, MinScore=%f\n",
config.Keywords.Algorithm,
config.Keywords.MaxKeywords,
config.Keywords.MinScore)
}
library(kreuzberg)
config <- extraction_config(
keywords = list(enabled = TRUE)
)
result <- extract_file_sync("document.pdf", "application/pdf", config)
cat(sprintf("Extracted %d keywords\n", length(result$keywords)))
if (length(result$keywords) > 0) {
for (i in seq_len(min(5L, length(result$keywords)))) {
cat(sprintf(" - %s\n", result$keywords[[i]]))
}
}
Example¶
import asyncio
from kreuzberg import extract_file, ExtractionConfig, KeywordConfig, KeywordAlgorithm
async def main() -> None:
config: ExtractionConfig = ExtractionConfig(
keywords=KeywordConfig(
algorithm=KeywordAlgorithm.YAKE,
max_keywords=10,
min_score=0.3
)
)
result = await extract_file("research_paper.pdf", config=config)
keywords: list = result.extracted_keywords or []
for kw in keywords:
score: float = kw.score or 0.0
text: str = kw.text or ""
print(f"{text}: {score:.3f}")
asyncio.run(main())
import { extractFile } from '@kreuzberg/node';
const config = {
keywords: {
algorithm: 'yake',
maxKeywords: 10,
minScore: 0.3,
},
};
const result = await extractFile('research_paper.pdf', null, config);
console.log(`Content length: ${result.content.length}`);
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
use kreuzberg::{extract_file, ExtractionConfig, KeywordConfig, KeywordAlgorithm};
let config = ExtractionConfig {
keywords: Some(KeywordConfig {
algorithm: KeywordAlgorithm::Yake,
max_keywords: 10,
min_score: 0.3,
..Default::default()
}),
..Default::default()
};
let result = extract_file("research_paper.pdf", None, &config).await?;
if let Some(keywords) = &result.extracted_keywords {
println!("Keywords: {:?}", keywords);
}
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
    maxKeywords := int32(10)
    minScore := 0.3
    config := &kreuzberg.ExtractionConfig{
        Keywords: &kreuzberg.KeywordConfig{
            Algorithm:   kreuzberg.KeywordAlgorithm_YAKE,
            MaxKeywords: &maxKeywords,
            MinScore:    &minScore,
        },
    }
    result, err := kreuzberg.ExtractFileSync("research_paper.pdf", config)
    if err != nil {
        log.Fatalf("extraction failed: %v", err)
    }
    if keywords, ok := result.Metadata["keywords"]; ok {
        keywordList := keywords.([]map[string]interface{})
        for _, kw := range keywordList {
            text := kw["text"].(string)
            score := kw["score"].(float64)
            fmt.Printf("%s: %.3f\n", text, score)
        }
    }
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.KeywordConfig;
import dev.kreuzberg.config.KeywordAlgorithm;
import java.util.List;
import java.util.Map;
ExtractionConfig config = ExtractionConfig.builder()
.keywords(KeywordConfig.builder()
.algorithm(KeywordAlgorithm.YAKE)
.maxKeywords(10)
.minScore(0.3)
.build())
.build();
ExtractionResult result = Kreuzberg.extractFile("research_paper.pdf", config);
Map<String, Object> metadata = result.getMetadata() != null ? result.getMetadata() : Map.of();
if (metadata.containsKey("keywords")) {
List<Map<String, Object>> keywords = (List<Map<String, Object>>) metadata.get("keywords");
for (Map<String, Object> kw : keywords) {
String text = (String) kw.get("text");
Double score = ((Number) kw.get("score")).doubleValue();
System.out.println(text + ": " + String.format("%.3f", score));
}
}
using Kreuzberg;
using System.Collections.Generic;
var config = new ExtractionConfig
{
Keywords = new KeywordConfig
{
Algorithm = KeywordAlgorithm.Yake,
MaxKeywords = 10,
MinScore = 0.3
}
};
var result = await KreuzbergClient.ExtractFileAsync(
"research_paper.pdf",
config
);
if (result.Metadata.ContainsKey("keywords"))
{
var keywords = (List<Dictionary<string, object>>)result.Metadata["keywords"];
foreach (var kw in keywords)
{
var text = (string)kw["text"];
var score = (double)kw["score"];
Console.WriteLine($"{text}: {score:F3}");
}
}
require 'kreuzberg'
config = Kreuzberg::Config::Extraction.new(
keywords: Kreuzberg::Config::Keywords.new(
algorithm: Kreuzberg::KeywordAlgorithm::YAKE,
max_keywords: 10,
min_score: 0.3
)
)
result = Kreuzberg.extract_file_sync('research_paper.pdf', config: config)
keywords = result.metadata&.dig('keywords') || []
keywords.each do |kw|
text = kw['text']
score = kw['score']
puts "#{text}: #{score.round(3)}"
end
library(kreuzberg)
config <- extraction_config(
keywords = list(enabled = TRUE)
)
result <- extract_file_sync("document.pdf", "application/pdf", config)
cat(sprintf("Keywords extracted: %d\n", length(result$keywords)))
if (length(result$keywords) > 0) {
cat("Top keywords:\n")
for (i in seq_len(min(10L, length(result$keywords)))) {
cat(sprintf(" %d. %s\n", i, result$keywords[[i]]))
}
}
Quality Processing¶
Score extracted text for quality issues (0.0–1.0, where 1.0 is highest quality). Detects OCR artifacts, script content, navigation elements, and structural issues.
| Factor | Weight | Detects |
|---|---|---|
| OCR Artifacts | 30% | Scattered chars, repeated punctuation, malformed words |
| Script Content | 20% | JavaScript, CSS, HTML tags |
| Navigation Elements | 10% | Breadcrumbs, pagination, skip links |
| Document Structure | 20% | Sentence/paragraph length, punctuation distribution |
| Metadata Quality | 10% | Presence of title, author, subject |
Score ranges: 0.0–0.3 very low, 0.3–0.6 low, 0.6–0.8 moderate, 0.8–1.0 high.
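For intuition, the score can be read as a weighted average over per-factor scores. The sketch below is illustrative only (the factor scores are invented and this is not the library's internal code); note the listed weights sum to 0.9, so the sketch normalizes by the total weight:

# Invented per-factor scores: 1.0 means no issues detected for that factor.
weights = {"ocr": 0.30, "script": 0.20, "navigation": 0.10, "structure": 0.20, "metadata": 0.10}
scores = {"ocr": 0.9, "script": 1.0, "navigation": 1.0, "structure": 0.7, "metadata": 0.5}
quality = sum(weights[k] * scores[k] for k in weights) / sum(weights.values())
print(f"{quality:.2f}")  # 0.84 -> high band (0.8-1.0)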
Configuration¶
import asyncio
from kreuzberg import ExtractionConfig, extract_file
async def main() -> None:
config: ExtractionConfig = ExtractionConfig(
enable_quality_processing=True
)
result = await extract_file("document.pdf", config=config)
quality_score: float = result.quality_score or 0.0
print(f"Quality score: {quality_score:.2f}")
asyncio.run(main())
Example¶
import asyncio
from kreuzberg import extract_file, ExtractionConfig

async def main() -> None:
    config = ExtractionConfig(enable_quality_processing=True)
    result = await extract_file("scanned_document.pdf", config=config)
    quality_score = result.quality_score or 0.0
    if quality_score < 0.5:
        print(f"Warning: Low quality extraction ({quality_score:.2f})")
        print("Consider re-scanning with higher DPI or adjusting OCR settings")
    else:
        print(f"Quality score: {quality_score:.2f}")

asyncio.run(main())
import { extractFile } from '@kreuzberg/node';
const config = {
enableQualityProcessing: true,
};
const result = await extractFile('scanned_document.pdf', null, config);
console.log(`Content length: ${result.content.length} characters`);
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
use kreuzberg::{extract_file, ExtractionConfig};
let config = ExtractionConfig {
enable_quality_processing: true,
..Default::default()
};
let result = extract_file("scanned_document.pdf", None, &config).await?;
if let Some(score) = result.quality_score {
if score < 0.5 {
println!("Warning: Low quality extraction ({:.2})", score);
} else {
println!("Quality score: {:.2}", score);
}
}
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
enableQualityProcessing := true
config := &kreuzberg.ExtractionConfig{
EnableQualityProcessing: &enableQualityProcessing,
}
result, err := kreuzberg.ExtractFileSync("scanned_document.pdf", config)
if err != nil {
log.Fatalf("extraction failed: %v", err)
}
qualityScore := 0.0
if result.QualityScore != nil {
qualityScore = *result.QualityScore
}
if qualityScore < 0.5 {
fmt.Printf("Warning: Low quality extraction (%.2f)\n", qualityScore)
fmt.Println("Consider re-scanning with higher DPI or adjusting OCR settings")
} else {
fmt.Printf("Quality score: %.2f\n", qualityScore)
}
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.config.ExtractionConfig;
ExtractionConfig config = ExtractionConfig.builder()
.enableQualityProcessing(true)
.build();
ExtractionResult result = Kreuzberg.extractFile("scanned_document.pdf", config);
double qualityScore = result.getQualityScore() != null ? result.getQualityScore() : 0.0;
if (qualityScore < 0.5) {
System.out.println(String.format("Warning: Low quality extraction (%.2f)", qualityScore));
System.out.println("Consider re-scanning with higher DPI or adjusting OCR settings");
} else {
System.out.println(String.format("Quality score: %.2f", qualityScore));
}
using System;
using Kreuzberg;
var config = new ExtractionConfig
{
EnableQualityProcessing = true
};
var result = KreuzbergClient.ExtractFile(
"scanned_document.pdf",
config
);
var qualityScore = result.QualityScore;
if (qualityScore < 0.5)
{
Console.WriteLine(
$"Warning: Low quality extraction ({qualityScore:F2})"
);
Console.WriteLine(
"Consider re-scanning with higher DPI or adjusting OCR settings"
);
}
else
{
Console.WriteLine($"Quality score: {qualityScore:F2}");
}
require 'kreuzberg'
config = Kreuzberg::Config::Extraction.new(
enable_quality_processing: true
)
result = Kreuzberg.extract_file_sync('scanned_document.pdf', config: config)
quality_score = result.quality_score || 0.0
if quality_score < 0.5
puts "Warning: Low quality extraction (#{quality_score.round(2)})"
puts "Consider re-scanning with higher DPI or adjusting OCR settings"
else
puts "Quality score: #{quality_score.round(2)}"
end
library(kreuzberg)
config <- extraction_config(enable_quality_processing = TRUE)
result <- extract_file_sync("document.pdf", "application/pdf", config)
cat(sprintf("Quality Metrics:\n"))
cat(sprintf("Quality Score: %.2f\n", result$quality_score))
cat(sprintf("Content Length: %d characters\n", nchar(result$content)))
cat(sprintf("Pages: %d\n", result$pages))
Combining Features¶
import asyncio
from kreuzberg import (
extract_file,
ExtractionConfig,
ChunkingConfig,
EmbeddingConfig,
EmbeddingModelType,
LanguageDetectionConfig,
TokenReductionConfig,
)
async def main() -> None:
config: ExtractionConfig = ExtractionConfig(
enable_quality_processing=True,
language_detection=LanguageDetectionConfig(enabled=True),
token_reduction=TokenReductionConfig(mode="moderate"),
chunking=ChunkingConfig(
max_chars=512,
max_overlap=50,
embedding=EmbeddingConfig(
model=EmbeddingModelType.preset("balanced"), normalize=True
),
),
)
result = await extract_file("document.pdf", config=config)
quality = result.quality_score or 0
print(f"Quality: {quality:.2f}")
print(f"Languages: {result.detected_languages}")
if result.chunks:
print(f"Chunks: {len(result.chunks)}")
asyncio.run(main())
import { extractFile } from '@kreuzberg/node';
const config = {
enableQualityProcessing: true,
languageDetection: {
enabled: true,
detectMultiple: true,
},
tokenReduction: {
mode: 'moderate',
preserveImportantWords: true,
},
chunking: {
maxChars: 512,
maxOverlap: 50,
embedding: {
preset: 'balanced',
},
},
keywords: {
algorithm: 'yake',
maxKeywords: 10,
},
};
const result = await extractFile('document.pdf', null, config);
console.log(`Content length: ${result.content.length}`);
if (result.detectedLanguages) {
console.log(`Languages: ${result.detectedLanguages.join(', ')}`);
}
if (result.chunks && result.chunks.length > 0) {
console.log(`Chunks: ${result.chunks.length}`);
}
use kreuzberg::{
extract_file, ExtractionConfig, ChunkingConfig, EmbeddingConfig,
LanguageDetectionConfig, TokenReductionConfig,
KeywordConfig, KeywordAlgorithm
};
let config = ExtractionConfig {
enable_quality_processing: true,
language_detection: Some(LanguageDetectionConfig {
enabled: true,
detect_multiple: true,
..Default::default()
}),
token_reduction: Some(TokenReductionConfig {
mode: "moderate".to_string(),
preserve_markdown: true,
..Default::default()
}),
chunking: Some(ChunkingConfig {
max_characters: 512,
overlap: 50,
embedding: Some(EmbeddingConfig {
model: kreuzberg::EmbeddingModelType::Preset { name: "balanced".to_string() },
normalize: true,
..Default::default()
}),
..Default::default()
}),
keywords: Some(KeywordConfig {
algorithm: KeywordAlgorithm::Yake,
max_keywords: 10,
..Default::default()
}),
..Default::default()
};
let result = extract_file("document.pdf", None, &config).await?;
if let Some(quality) = result.quality_score {
println!("Quality: {:.2}", quality);
}
println!("Languages: {:?}", result.detected_languages);
if let Some(keywords) = &result.extracted_keywords {
println!("Keywords: {:?}", keywords);
}
if let Some(chunks) = result.chunks {
if let Some(first_chunk) = chunks.first() {
if let Some(embedding) = &first_chunk.embedding {
println!("Chunks: {} with {} dimensions", chunks.len(), embedding.len());
}
}
}
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
maxChars := 512
maxOverlap := 50
minConfidence := 0.8
enableQualityProcessing := true
config := &kreuzberg.ExtractionConfig{
EnableQualityProcessing: &enableQualityProcessing,
LanguageDetection: &kreuzberg.LanguageDetectionConfig{
Enabled: true,
MinConfidence: &minConfidence,
DetectMultiple: true,
},
TokenReduction: &kreuzberg.TokenReductionConfig{
Mode: "moderate",
PreserveMarkdown: true,
},
Chunking: &kreuzberg.ChunkingConfig{
MaxChars: &maxChars,
MaxOverlap: &maxOverlap,
Embedding: &kreuzberg.EmbeddingConfig{
Model: "balanced",
Normalize: true,
},
},
Keywords: &kreuzberg.KeywordConfig{
Algorithm: "YAKE",
MaxKeywords: 10,
},
}
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
if err != nil {
log.Fatalf("extract failed: %v", err)
}
if result.QualityScore != nil {
fmt.Printf("Quality: %.2f\n", *result.QualityScore)
}
fmt.Printf("Languages: %v\n", result.DetectedLanguages)
fmt.Printf("Keywords: %v\n", result.ExtractedKeywords)
if len(result.Chunks) > 0 && result.Chunks[0].Embedding != nil {
fmt.Printf("Chunks: %d with %d dimensions\n", len(result.Chunks), len(result.Chunks[0].Embedding))
}
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.ChunkingConfig;
import dev.kreuzberg.config.LanguageDetectionConfig;
import dev.kreuzberg.config.TokenReductionConfig;
ExtractionConfig config = ExtractionConfig.builder()
.enableQualityProcessing(true)
.languageDetection(LanguageDetectionConfig.builder()
.enabled(true)
.minConfidence(0.8)
.build())
.tokenReduction(TokenReductionConfig.builder()
.mode("moderate")
.preserveImportantWords(true)
.build())
.chunking(ChunkingConfig.builder()
.maxChars(512)
.maxOverlap(50)
.embedding("balanced")
.build())
.build();
ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
System.out.printf("Quality: %.2f%n", result.getQualityScore());
System.out.println("Languages: " + result.getDetectedLanguages());
System.out.println("Content length: " + result.getContent().length() + " characters");
using System;
using System.Collections.Generic;
using System.Threading.Tasks;
using Kreuzberg;
async Task RunRagPipeline()
{
var config = new ExtractionConfig
{
EnableQualityProcessing = true,
LanguageDetection = new LanguageDetectionConfig
{
Enabled = true,
DetectMultiple = true,
MinConfidence = 0.8,
},
TokenReduction = new TokenReductionConfig
{
Mode = "moderate",
PreserveImportantWords = true,
},
Chunking = new ChunkingConfig
{
MaxChars = 512,
MaxOverlap = 50,
Embedding = new Dictionary<string, object?>
{
{ "preset", "balanced" },
},
Enabled = true,
},
Keywords = new KeywordConfig
{
Algorithm = "yake",
MaxKeywords = 10,
},
};
var result = await KreuzbergClient.ExtractFileAsync("document.pdf", config);
Console.WriteLine($"Content length: {result.Content.Length} characters");
if (result.DetectedLanguages?.Count > 0)
{
Console.WriteLine($"Languages: {string.Join(", ", result.DetectedLanguages)}");
}
if (result.Chunks?.Count > 0)
{
Console.WriteLine($"Total chunks: {result.Chunks.Count}");
var firstChunk = result.Chunks[0];
Console.WriteLine($"First chunk tokens: {firstChunk.Metadata.TokenCount}");
if (firstChunk.Embedding?.Length > 0)
{
Console.WriteLine($"Embedding dimensions: {firstChunk.Embedding.Length}");
}
}
Console.WriteLine($"Quality score: {result.QualityScore}");
if (result.ExtractedKeywords?.Count > 0)
{
Console.WriteLine($"Keywords: {string.Join(", ", result.ExtractedKeywords)}");
}
}
await RunRagPipeline();
require 'kreuzberg'
config = Kreuzberg::Config::Extraction.new(
enable_quality_processing: true,
language_detection: Kreuzberg::Config::LanguageDetection.new(
enabled: true,
detect_multiple: true
),
token_reduction: Kreuzberg::Config::TokenReduction.new(mode: 'moderate'),
chunking: Kreuzberg::Config::Chunking.new(
max_characters: 512,
overlap: 50,
embedding: { normalize: true }
),
keywords: Kreuzberg::Config::Keywords.new(
algorithm: 'yake',
max_keywords: 10
)
)
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
puts "Languages: #{result.detected_languages.inspect}"
puts "Chunks: #{result.chunks&.length || 0}"
library(kreuzberg)
ocr_cfg <- ocr_config(backend = "tesseract", language = "eng", dpi = 300L)
chunking_cfg <- chunking_config(max_characters = 1200L, overlap = 250L)
config <- extraction_config(
ocr = ocr_cfg,
force_ocr = TRUE,
chunking = chunking_cfg,
language_detection = list(enabled = TRUE),
keywords = list(enabled = TRUE),
enable_quality_processing = TRUE,
output_format = "markdown"
)
result <- extract_file_sync("document.pdf", "application/pdf", config)
cat(sprintf("Language: %s | Quality: %.2f | Chunks: %d | Keywords: %d\n",
result$detected_language, result$quality_score,
length(result$chunks), length(result$keywords)))
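A typical downstream use of the combined pipeline is to gate ingestion on the quality score and collect chunk/embedding pairs for a vector store. Here is a minimal async Python sketch; ingest and min_quality are illustrative names, and it assumes each chunk exposes an embedding attribute once an EmbeddingConfig is set, as the Rust and C# examples above suggest:
import asyncio

from kreuzberg import (
    ChunkingConfig,
    EmbeddingConfig,
    EmbeddingModelType,
    ExtractionConfig,
    extract_file,
)

async def ingest(path: str, min_quality: float = 0.5) -> list[tuple[str, list[float]]]:
    config = ExtractionConfig(
        enable_quality_processing=True,
        chunking=ChunkingConfig(
            max_chars=512,
            max_overlap=50,
            embedding=EmbeddingConfig(
                model=EmbeddingModelType.preset("balanced"), normalize=True
            ),
        ),
    )
    result = await extract_file(path, config=config)
    quality = result.quality_score or 0.0
    if quality < min_quality:
        # Low-quality extractions pollute the index; re-scan or adjust OCR instead.
        print(f"Skipping {path}: quality {quality:.2f} below {min_quality}")
        return []
    # Assumes chunk.embedding is populated when an EmbeddingConfig is set.
    return [
        (chunk.content, chunk.embedding)
        for chunk in result.chunks or []
        if chunk.embedding is not None
    ]

pairs = asyncio.run(ingest("document.pdf"))
print(f"Prepared {len(pairs)} chunk/embedding pairs for indexing")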