Extraction Basics¶
flowchart LR
Input[Input] --> FileOrBytes{File or Bytes?}
FileOrBytes -->|File Path| FileFunctions[File Functions]
FileOrBytes -->|In-Memory Data| BytesFunctions[Bytes Functions]
FileFunctions --> SingleOrBatch{Single or Batch?}
BytesFunctions --> SingleOrBatch
SingleOrBatch -->|Single| SingleMode[Single Extraction]
SingleOrBatch -->|Multiple| BatchMode[Batch Processing]
SingleMode --> SyncAsync{Sync or Async?}
BatchMode --> SyncAsync
SyncAsync -->|Sync| SyncFuncs[extract_file_sync<br/>extract_bytes_sync<br/>batch_extract_files_sync<br/>batch_extract_bytes_sync]
SyncAsync -->|Async| AsyncFuncs[extract_file<br/>extract_bytes<br/>batch_extract_files<br/>batch_extract_bytes]
style FileFunctions fill:#87CEEB
style BytesFunctions fill:#FFD700
style SyncFuncs fill:#90EE90
style AsyncFuncs fill:#FFB6C1

Kreuzberg provides 8 core extraction functions organized into 4 categories: file extraction, bytes extraction, batch file extraction, and batch bytes extraction. Each category has both a sync and an async variant.
Extract from Files¶
Extract text, tables, and metadata from a file on disk.
Synchronous¶
from kreuzberg import extract_file_sync, ExtractionConfig
config: ExtractionConfig = ExtractionConfig()
result = extract_file_sync("document.pdf", config=config)
content: str = result.content
table_count: int = len(result.tables)
metadata: dict = result.metadata
print(f"Content length: {len(content)} characters")
print(f"Tables: {table_count}")
print(f"Metadata keys: {list(metadata.keys())}")
use kreuzberg::{extract_file_sync, ExtractionConfig};
fn main() -> kreuzberg::Result<()> {
let result = extract_file_sync("document.pdf", None, &ExtractionConfig::default())?;
println!("{}", result.content);
println!("Tables: {}", result.tables.len());
println!("Metadata: {:?}", result.metadata);
Ok(())
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;
try {
ExtractionResult result = Kreuzberg.extractFile("document.pdf");
System.out.println(result.getContent());
System.out.println("Tables: " + result.getTables().size());
System.out.println("Metadata: " + result.getMetadata());
} catch (IOException | KreuzbergException e) {
e.printStackTrace();
}
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
if err != nil {
log.Fatalf("extract failed: %v", err)
}
fmt.Println(result.Content)
fmt.Printf("Tables: %d\n", len(result.Tables))
fmt.Printf("Metadata: %+v\n", result.Metadata)
}
import { initWasm, extractFile } from '@kreuzberg/wasm';
// Initialize WASM module once at app startup
await initWasm();
// Extract from file path (Node.js/Deno/Bun only)
const result = await extractFile('document.pdf');
console.log(result.content);
console.log(`Tables: ${result.tables.length}`);
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
Asynchronous¶
import asyncio
from kreuzberg import extract_file, ExtractionConfig
async def main() -> None:
config: ExtractionConfig = ExtractionConfig()
result = await extract_file("document.pdf", config=config)
content: str = result.content
table_count: int = len(result.tables)
print(f"Content length: {len(content)} characters")
print(f"Tables: {table_count}")
asyncio.run(main())
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import java.nio.file.Path;
import java.util.concurrent.CompletableFuture;
public class Example {
public static void main(String[] args) {
CompletableFuture<ExtractionResult> future =
Kreuzberg.extractFileAsync(Path.of("document.pdf"), null);
future.thenAccept(result -> {
System.out.println(result.getContent());
System.out.println("Tables: " + result.getTables().size());
}).join();
}
}
package main
import (
"context"
"fmt"
"log"
"time"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
result, err := kreuzberg.ExtractFile(ctx, "document.pdf", nil)
if err != nil {
log.Fatalf("extract failed: %v", err)
}
fmt.Println(result.Content)
}
import { initWasm, extractFile } from '@kreuzberg/wasm';
await initWasm();
// Extract from file path (async)
const result = await extractFile('document.pdf');
console.log(result.content);
console.log(`Tables: ${result.tables.length}`);
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
flowchart TD
Start[Choose Sync or Async] --> Context{Already in<br/>Async Context?}
Context -->|Yes| UseAsync[Use Async Variants]
Context -->|No| CheckUse{Use Case}
CheckUse -->|Simple Script| UseSyncSimple[Use Sync<br/>Simpler, same speed]
CheckUse -->|Multiple Files| CheckConcurrency{Concurrent<br/>Processing?}
CheckUse -->|Single File| UseSyncSingle[Use Sync<br/>Easier to read]
CheckConcurrency -->|Yes| UseAsyncConcurrent[Use Async<br/>Better concurrency]
CheckConcurrency -->|No| UseSyncBatch[Use Sync<br/>Adequate]
UseAsync --> AsyncFunctions[extract_file<br/>extract_bytes<br/>batch_extract_files<br/>batch_extract_bytes]
UseSyncSimple --> SyncFunctions[extract_file_sync<br/>extract_bytes_sync<br/>batch_extract_files_sync<br/>batch_extract_bytes_sync]
UseSyncSingle --> SyncFunctions
UseAsyncConcurrent --> AsyncFunctions
UseSyncBatch --> SyncFunctions
style UseAsync fill:#FFB6C1
style UseAsyncConcurrent fill:#FFB6C1
style UseSyncSimple fill:#90EE90
style UseSyncSingle fill:#90EE90
style UseSyncBatch fill:#90EE90

TypeScript / Node.js¶
All TypeScript/Node.js examples in this guide use the @kreuzberg/node package. Both the synchronous APIs and their asynchronous counterparts are imported from the package's root module. See the TypeScript API Reference for complete type definitions.
import { extractFileSync, ExtractionConfig } from '@kreuzberg/node';
// Extract a document using synchronous API
const result = extractFileSync('document.pdf', null, new ExtractionConfig());
console.log(result.content);
Ruby¶
Ruby bindings mirror the same function names (extract_file_sync, extract_bytes, batch_extract_files, etc.) under the Kreuzberg module. Configuration objects live under Kreuzberg::Config. See the Ruby API Reference for details.
require 'kreuzberg'
# Extract a document with OCR enabled
config = Kreuzberg::Config::Extraction.new(force_ocr: true)
result = Kreuzberg.extract_file_sync('document.pdf', config: config)
puts result.content
When to Use Async
Use async variants when you're already in an async context or processing multiple files concurrently. For simple scripts, sync variants are simpler and just as fast.
Extract from Bytes¶
Extract from data already loaded in memory.
Synchronous¶
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
try {
byte[] data = Files.readAllBytes(Paths.get("document.pdf"));
ExtractionResult result = Kreuzberg.extractBytes(
data,
"application/pdf",
null
);
System.out.println(result.getContent());
} catch (IOException | KreuzbergException e) {
e.printStackTrace();
}
package main
import (
"log"
"os"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
data, err := os.ReadFile("document.pdf")
if err != nil {
log.Fatalf("read file: %v", err)
}
result, err := kreuzberg.ExtractBytesSync(data, "application/pdf", nil)
if err != nil {
log.Fatalf("extract failed: %v", err)
}
log.Println(result.Content)
}
import { initWasm, extractBytes } from '@kreuzberg/wasm';
await initWasm();
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, 'application/pdf');
console.log(result.content);
console.log(`Tables: ${result.tables.length}`);
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
Asynchronous¶
use kreuzberg::{extract_bytes, ExtractionConfig};
use tokio::fs;
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
let data = fs::read("document.pdf").await?;
let result = extract_bytes(
&data,
"application/pdf",
&ExtractionConfig::default()
).await?;
println!("{}", result.content);
Ok(())
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.concurrent.CompletableFuture;
try {
byte[] data = Files.readAllBytes(Paths.get("document.pdf"));
CompletableFuture<ExtractionResult> future = Kreuzberg.extractBytesAsync(
data,
"application/pdf",
null
);
future.thenAccept(result -> System.out.println(result.getContent()))
.join();
} catch (IOException e) {
e.printStackTrace();
}
package main
import (
"context"
"log"
"os"
"time"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
data, err := os.ReadFile("document.pdf")
if err != nil {
log.Fatalf("read file: %v", err)
}
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
result, err := kreuzberg.ExtractBytes(ctx, data, "application/pdf", nil)
if err != nil {
log.Fatalf("extract failed: %v", err)
}
log.Println(result.Content)
}
import { initWasm, extractBytes } from '@kreuzberg/wasm';
await initWasm();
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, 'application/pdf');
console.log(result.content);
console.log(`Tables: ${result.tables.length}`);
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
MIME Type Detection
Kreuzberg automatically detects MIME types from file extensions. When extracting from bytes, you must provide the MIME type explicitly.
Batch Processing¶
Process multiple files concurrently for better performance.
Batch Extract Files¶
from kreuzberg import batch_extract_files_sync, ExtractionConfig
files: list[str] = ["doc1.pdf", "doc2.docx", "doc3.pptx"]
config: ExtractionConfig = ExtractionConfig()
results = batch_extract_files_sync(files, config=config)
for i, result in enumerate(results):
char_count: int = len(result.content)
print(f"File {i + 1}: {char_count} characters")
use kreuzberg::{batch_extract_file_sync, ExtractionConfig};
fn main() -> kreuzberg::Result<()> {
let files = vec!["doc1.pdf", "doc2.docx", "doc3.pptx"];
let config = ExtractionConfig::default();
let results = batch_extract_file_sync(&files, None, &config)?;
for (i, result) in results.iter().enumerate() {
println!("File {}: {} characters", i + 1, result.content.len());
}
Ok(())
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
try {
List<String> files = Arrays.asList("doc1.pdf", "doc2.docx", "doc3.pptx");
List<ExtractionResult> results = Kreuzberg.batchExtractFiles(files, null);
for (int i = 0; i < results.size(); i++) {
ExtractionResult result = results.get(i);
System.out.println("File " + (i + 1) + ": " + result.getContent().length() + " characters");
}
} catch (IOException | KreuzbergException e) {
e.printStackTrace();
}
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
files := []string{"doc1.pdf", "doc2.docx", "doc3.pptx"}
results, err := kreuzberg.BatchExtractFilesSync(files, nil)
if err != nil {
log.Fatalf("batch extract failed: %v", err)
}
for i, result := range results {
if result == nil {
continue
}
fmt.Printf("File %d: %d characters\n", i+1, len(result.Content))
}
}
import { initWasm, batchExtractFiles } from '@kreuzberg/wasm';
await initWasm();
const files = [
new File(['content1'], 'doc1.pdf', { type: 'application/pdf' }),
new File(['content2'], 'doc2.pdf', { type: 'application/pdf' })
];
const results = await batchExtractFiles(files);
results.forEach((result, index) => {
console.log(`Document ${index + 1}: ${result.content.substring(0, 100)}...`);
});
Batch Extract Bytes¶
from kreuzberg import batch_extract_bytes_sync, ExtractionConfig
files: list[str] = ["doc1.pdf", "doc2.docx"]
data_list: list[bytes] = []
mime_types: list[str] = []
for file in files:
with open(file, "rb") as f:
data_list.append(f.read())
mime_type: str = "application/pdf" if file.endswith(".pdf") else "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
mime_types.append(mime_type)
config: ExtractionConfig = ExtractionConfig()
results = batch_extract_bytes_sync(data_list, mime_types, config=config)
for i, result in enumerate(results):
char_count: int = len(result.content)
print(f"Document {i + 1}: {char_count} characters")
import { batchExtractBytesSync } from '@kreuzberg/node';
import { readFileSync } from 'fs';
const files = ['doc1.pdf', 'doc2.docx'];
const dataList = files.map(f => readFileSync(f));
const mimeTypes = [
'application/pdf',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
];
const results = batchExtractBytesSync(dataList, mimeTypes);
results.forEach((result, i) => {
console.log(`Document ${i + 1}: ${result.content.length} characters`);
});
use kreuzberg::{batch_extract_bytes_sync, ExtractionConfig};
use std::fs;
fn main() -> kreuzberg::Result<()> {
let files = vec!["doc1.pdf", "doc2.docx"];
let data_list: Vec<Vec<u8>> = files.iter()
.map(|f| fs::read(f).expect("read file"))
.collect();
let mime_types: Vec<&str> = files.iter()
.map(|f| if f.ends_with(".pdf") {
"application/pdf"
} else {
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
})
.collect();
let results = batch_extract_bytes_sync(
&data_list,
&mime_types,
&ExtractionConfig::default()
)?;
for (i, result) in results.iter().enumerate() {
println!("Document {}: {} characters", i + 1, result.content.len());
}
Ok(())
}
require 'kreuzberg'
files = ['doc1.pdf', 'doc2.docx']
data_list = files.map { |f| File.binread(f) }
mime_types = files.map do |f|
f.end_with?('.pdf') ? 'application/pdf' :
'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
end
results = Kreuzberg.batch_extract_bytes_sync(
data_list,
mime_types
)
results.each_with_index do |result, i|
puts "Document #{i + 1}: #{result.content.length} characters"
end
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.BytesWithMime;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
try {
List<String> files = Arrays.asList("doc1.pdf", "doc2.docx");
List<BytesWithMime> dataList = new ArrayList<>();
for (String file : files) {
byte[] data = Files.readAllBytes(Paths.get(file));
String mimeType = file.endsWith(".pdf") ? "application/pdf" :
"application/vnd.openxmlformats-officedocument.wordprocessingml.document";
dataList.add(new BytesWithMime(data, mimeType));
}
List<ExtractionResult> results = Kreuzberg.batchExtractBytes(dataList, null);
for (int i = 0; i < results.size(); i++) {
ExtractionResult result = results.get(i);
System.out.println("Document " + (i + 1) + ": " + result.getContent().length() + " characters");
}
} catch (IOException | KreuzbergException e) {
e.printStackTrace();
}
package main
import (
"fmt"
"log"
"os"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
files := []struct {
Path string
MIME string
}{
{"doc1.pdf", "application/pdf"},
{"doc2.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"},
}
items := make([]kreuzberg.BytesWithMime, 0, len(files))
for _, file := range files {
data, err := os.ReadFile(file.Path)
if err != nil {
log.Fatalf("read %s: %v", file.Path, err)
}
items = append(items, kreuzberg.BytesWithMime{
Data: data,
MimeType: file.MIME,
})
}
results, err := kreuzberg.BatchExtractBytesSync(items, nil)
if err != nil {
log.Fatalf("batch extract failed: %v", err)
}
for i, result := range results {
if result == nil {
continue
}
fmt.Printf("Document %d: %d characters\n", i+1, len(result.Content))
}
}
using Kreuzberg;
var documents = new[]
{
new BytesWithMime(await File.ReadAllBytesAsync("doc1.pdf"), "application/pdf"),
new BytesWithMime(await File.ReadAllBytesAsync("doc2.docx"), "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
};
var results = KreuzbergClient.BatchExtractBytesSync(documents, new ExtractionConfig());
Console.WriteLine($"Processed {results.Count} documents");
import { initWasm, batchExtractBytes } from '@kreuzberg/wasm';
await initWasm();
const dataList = [
new Uint8Array(buffer1),
new Uint8Array(buffer2)
];
const mimeTypes = [
'application/pdf',
'application/pdf'
];
const results = await batchExtractBytes(dataList, mimeTypes);
results.forEach((result, index) => {
console.log(`Document ${index + 1}: ${result.content.substring(0, 100)}...`);
});
flowchart TD
Start[Multiple Files to Process] --> Method{Processing Method}
Method -->|Sequential| Sequential[Process One by One]
Method -->|Batch| Batch[Batch Processing]
Sequential --> Seq1[File 1: 1.0s]
Seq1 --> Seq2[File 2: 1.0s]
Seq2 --> Seq3[File 3: 1.0s]
Seq3 --> Seq4[File 4: 1.0s]
Seq4 --> SeqTotal[Total: 4.0s]
Batch --> Parallel[Automatic Parallelization]
Parallel --> Par1[File 1: 1.0s]
Parallel --> Par2[File 2: 1.0s]
Parallel --> Par3[File 3: 1.0s]
Parallel --> Par4[File 4: 1.0s]
Par1 --> ParTotal[Total: ~1.2s]
Par2 --> ParTotal
Par3 --> ParTotal
Par4 --> ParTotal
SeqTotal --> Result[Sequential: Slow]
ParTotal --> ResultFast[Batch: 2-5x Faster]
style Sequential fill:#FFB6C1
style Batch fill:#90EE90
style SeqTotal fill:#FF6B6B
style ResultFast fill:#4CAF50

Performance
Batch processing provides automatic parallelization. For large sets of files, this can be 2-5x faster than processing files sequentially.
Supported Formats¶
Kreuzberg supports 56 file formats across 8 categories:
| Format | Extensions | Notes |
|---|---|---|
| PDF | .pdf | Native text + OCR for scanned pages |
| Images | .png, .jpg, .jpeg, .tiff, .bmp, .webp | Requires OCR backend |
| Office | .docx, .pptx, .xlsx | Modern formats via native parsers |
| Legacy Office | .doc, .ppt | Requires LibreOffice |
| Email | .eml, .msg | Full support including attachments |
| Web | .html, .htm | Converted to Markdown with metadata |
| Text | .md, .txt, .xml, .json, .yaml, .toml, .csv | Direct extraction |
| Archives | .zip, .tar, .tar.gz, .tar.bz2 | Recursive extraction |
See the installation guide for optional dependencies (Tesseract, LibreOffice).
Page Tracking and Boundaries¶
Kreuzberg can track page boundaries and extract per-page content for supported formats.
When Page Tracking is Available¶
Page tracking is format-specific:
- PDF: Full byte-accurate page tracking with O(1) lookup performance
- PPTX: Slide boundary tracking (each slide is a "page")
- DOCX: Best-effort page break detection using explicit `<w:br type="page"/>` tags
- Other formats: No page tracking (boundaries and pages are None/null)
Enabling Page Extraction¶
To extract per-page content, enable extract_pages:
graph LR
A[Document] --> B[Extract with PageConfig]
B --> C[Combined content]
B --> D[Pages array]
D --> E[Page 1]
D --> F[Page 2]
D --> G[Page N]

See PageConfig documentation for configuration details.
Page Markers¶
Optionally insert page markers into the combined content string:
config = ExtractionConfig(
pages=PageConfig(
insert_page_markers=True,
marker_format="\n\n<!-- PAGE {page_num} -->\n\n"
)
)
This adds markers like <!-- PAGE 1 --> at page boundaries in the content field, useful for LLMs to understand document structure.
Relationship with Chunking¶
When both page tracking and chunking are enabled, chunks automatically include first_page and last_page metadata showing which pages they span.
See Advanced Page Tracking for chunk-to-page mapping examples.
Error Handling¶
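All extraction functions report failures through each language's native error mechanism: exceptions in Python, Ruby, Java, C#, and JavaScript, and returned error values (`Result` / `error`) in Rust and Go: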
from kreuzberg import extract_file_sync, extract_bytes_sync, ExtractionConfig
from kreuzberg import (
KreuzbergError,
ParsingError,
OCRError,
ValidationError,
)
try:
result = extract_file_sync("document.pdf")
print(f"Extracted {len(result.content)} characters")
except ParsingError as e:
print(f"Failed to parse document: {e}")
except OCRError as e:
print(f"OCR processing failed: {e}")
except KreuzbergError as e:
print(f"Extraction error: {e}")
try:
config: ExtractionConfig = ExtractionConfig()
pdf_bytes: bytes = b"%PDF-1.4\n"
result = extract_bytes_sync(pdf_bytes, "application/pdf", config)
print(f"Extracted: {result.content[:100]}")
except ValidationError as e:
print(f"Invalid configuration: {e}")
except OCRError as e:
print(f"OCR failed: {e}")
except KreuzbergError as e:
print(f"Extraction failed: {e}")
use kreuzberg::{extract_file_sync, extract_bytes_sync, ExtractionConfig, KreuzbergError};
fn main() -> kreuzberg::Result<()> {
match extract_file_sync("document.pdf", None, &ExtractionConfig::default()) {
Ok(result) => {
println!("Extracted {} characters", result.content.len());
}
Err(KreuzbergError::Parsing { message, .. }) => {
eprintln!("Failed to parse document: {}", message);
}
Err(KreuzbergError::Ocr { message, .. }) => {
eprintln!("OCR processing failed: {}", message);
}
Err(KreuzbergError::MissingDependency { message, .. }) => {
eprintln!("Missing dependency: {}", message);
}
Err(e) => {
eprintln!("Extraction failed: {}", e);
}
}
let pdf_bytes = b"%PDF-1.4\n...";
match extract_bytes_sync(pdf_bytes, "application/pdf", None, &ExtractionConfig::default()) {
Ok(result) => {
println!("Extracted: {}", &result.content[..100.min(result.content.len())]);
Ok(())
}
Err(KreuzbergError::Validation { message, .. }) => {
eprintln!("Invalid configuration: {}", message);
Err(KreuzbergError::Validation {
message: message.clone(),
source: None,
})
}
Err(KreuzbergError::Ocr { message, .. }) => {
eprintln!("OCR failed: {}", message);
Err(KreuzbergError::Ocr {
message: message.clone(),
source: None,
})
}
Err(e) => {
eprintln!("Extraction failed: {}", e);
Err(e)
}
}
}
require 'kreuzberg'
begin
result = Kreuzberg.extract_file_sync('document.pdf')
puts result.content
rescue Kreuzberg::ValidationError => e
puts "Invalid configuration: #{e.message}"
rescue Kreuzberg::ParsingError => e
puts "Failed to parse document: #{e.message}"
rescue Kreuzberg::OCRError => e
puts "OCR processing failed: #{e.message}"
rescue Kreuzberg::MissingDependencyError => e
puts "Missing dependency: #{e.message}"
rescue Kreuzberg::Error => e
puts "Extraction error: #{e.message}"
rescue StandardError => e
puts "System error: #{e.message}"
end
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;
try {
ExtractionResult result = Kreuzberg.extractFile("document.pdf");
System.out.println("Extracted: " + result.getContent()
.substring(0, Math.min(100, result.getContent().length())));
} catch (IOException e) {
System.err.println("File not found: " + e.getMessage());
} catch (KreuzbergException e) {
System.err.println("Extraction failed: " + e.getMessage());
}
try {
byte[] pdfBytes = new byte[] { };
ExtractionResult result = Kreuzberg.extractBytes(pdfBytes, "application/pdf", null);
System.out.println("Extracted " + result.getContent().length() + " characters");
} catch (KreuzbergException e) {
System.err.println("Extraction failed: " + e.getMessage());
}
package main
import (
"errors"
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
if err != nil {
switch {
case errors.As(err, new(*kreuzberg.ValidationError)):
log.Fatalf("invalid configuration: %v", err)
case errors.As(err, new(*kreuzberg.ParsingError)):
log.Fatalf("failed to parse document: %v", err)
case errors.As(err, new(*kreuzberg.OCRError)):
log.Fatalf("OCR processing failed: %v", err)
case errors.As(err, new(*kreuzberg.MissingDependencyError)):
log.Fatalf("missing dependency: %v", err)
default:
log.Fatalf("extraction error: %v", err)
}
}
fmt.Println(result.Content)
}
using Kreuzberg;
try
{
var result = KreuzbergClient.ExtractFileSync("missing.pdf");
Console.WriteLine(result.Content);
}
catch (KreuzbergValidationException ex)
{
Console.Error.WriteLine($"Validation error: {ex.Message}");
}
catch (KreuzbergIOException ex)
{
Console.Error.WriteLine($"IO error: {ex.Message}");
throw;
}
catch (KreuzbergException ex)
{
Console.Error.WriteLine($"Extraction failed: {ex.Message}");
throw;
}
import { initWasm, extractBytes } from '@kreuzberg/wasm';
try {
await initWasm();
const bytes = new Uint8Array(buffer);
const result = await extractBytes(bytes, 'application/pdf');
console.log(result.content);
} catch (error) {
if (error instanceof Error) {
console.error(`Extraction error: ${error.message}`);
} else {
throw error;
}
}
System Errors
OSError (Python), std::io::Error (Rust), IOException (Java), and other system-level errors always bubble up to users. These indicate real system problems that need to be addressed (permissions, disk space, etc.).
Next Steps¶
- Configuration - Learn about all configuration options
- OCR Guide - Set up optical character recognition
- Advanced Features - Chunking, language detection, and more
- API Reference - Detailed API documentation