Quick Start¶
Get up and running with Kreuzberg in minutes.
Choosing Your TypeScript Package¶
Kreuzberg provides two TypeScript packages for different runtimes:
@kreuzberg/node – use for Node.js servers and CLI tools (native performance, 100% speed). @kreuzberg/wasm – use for browsers, Cloudflare Workers, Deno, Bun, and serverless (60–80% speed, cross-platform).
The examples below show both. Pick the one matching your runtime. See Platform Overview for detailed guidance.
Basic Extraction¶
Extract text from any supported document format:
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
if err != nil {
log.Fatalf("extract failed: %v", err)
}
fmt.Println(result.Content)
fmt.Printf("Tables: %d\n", len(result.Tables))
fmt.Printf("Metadata: %+v\n", result.Metadata)
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;

// Synchronous extraction: print the content followed by table and
// metadata summaries.
try {
    ExtractionResult extraction = Kreuzberg.extractFile("document.pdf");
    System.out.println(extraction.getContent());
    System.out.println("Tables: " + extraction.getTables().size());
    System.out.println("Metadata: " + extraction.getMetadata());
} catch (IOException | KreuzbergException e) {
    e.printStackTrace();
}
from kreuzberg import extract_file_sync, ExtractionConfig

# Synchronous extraction with the default configuration.
result = extract_file_sync("document.pdf", config=ExtractionConfig())

content: str = result.content
table_count: int = len(result.tables)
metadata: dict = result.metadata

print(f"Content length: {len(content)} characters")
print(f"Tables: {table_count}")
print(f"Metadata keys: {list(metadata.keys())}")
use kreuzberg::{extract_file_sync, ExtractionConfig};

/// Synchronous extraction with the default configuration.
fn main() -> kreuzberg::Result<()> {
    let cfg = ExtractionConfig::default();
    let extraction = extract_file_sync("document.pdf", None, &cfg)?;

    println!("{}", extraction.content);
    println!("Tables: {}", extraction.tables.len());
    println!("Metadata: {:?}", extraction.metadata);
    Ok(())
}
import { extractFromFile, initWasm } from '@kreuzberg/wasm';

// The WASM runtime must be initialised once before any extraction call.
await initWasm();

const picker = document.getElementById('file') as HTMLInputElement;
const chosen = picker.files?.[0];
if (chosen) {
  const result = await extractFromFile(chosen);
  console.log(result.content);
  console.log(`Tables: ${result.tables.length}`);
  console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
}
Async Extraction¶
For better performance with I/O-bound operations:
package main
import (
"context"
"fmt"
"log"
"time"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
result, err := kreuzberg.ExtractFile(ctx, "document.pdf", nil)
if err != nil {
log.Fatalf("extract failed: %v", err)
}
fmt.Println(result.Content)
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import java.nio.file.Path;
import java.util.concurrent.CompletableFuture;

/** Asynchronous extraction using a CompletableFuture. */
public class Example {
    public static void main(String[] args) {
        CompletableFuture<ExtractionResult> pending =
                Kreuzberg.extractFileAsync(Path.of("document.pdf"), null);

        // Print a summary once extraction finishes, blocking until done.
        pending.thenAccept(extraction -> {
            System.out.println(extraction.getContent());
            System.out.println("Tables: " + extraction.getTables().size());
        }).join();
    }
}
import asyncio

from kreuzberg import extract_file, ExtractionConfig


async def main() -> None:
    # Await the extraction; useful when processing documents concurrently.
    result = await extract_file("document.pdf", config=ExtractionConfig())
    text: str = result.content
    n_tables: int = len(result.tables)
    print(f"Content length: {len(text)} characters")
    print(f"Tables: {n_tables}")


asyncio.run(main())
# Run extraction in a background task, then await its result.
extraction_task = Kreuzberg.extract_file_async("document.pdf")
{:ok, result} = Task.await(extraction_task)

content = result.content
table_count = length(result.tables)
metadata = result.metadata

IO.puts("Content length: #{byte_size(content)} characters")
IO.puts("Tables: #{table_count}")
IO.puts("Metadata keys: #{inspect(Map.keys(metadata))}")
import { extractFromFile, initWasm } from '@kreuzberg/wasm';

// Initialise the WASM runtime before the first extraction.
await initWasm();

const picker = document.getElementById('file') as HTMLInputElement;
const chosen = picker.files?.[0];
if (chosen) {
  const result = await extractFromFile(chosen);
  const content = result.content;
  const tableCount = result.tables.length;
  console.log(`Content length: ${content.length} characters`);
  console.log(`Tables: ${tableCount}`);
}
Not Applicable
Async extraction is an API-level feature. The CLI operates synchronously. Use language-specific bindings (Python, TypeScript, Rust, WASM) for async operations.
OCR Extraction¶
Extract text from images and scanned documents:
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
lang := "eng"
cfg := &kreuzberg.ExtractionConfig{
OCR: &kreuzberg.OCRConfig{
Backend: "tesseract",
Language: &lang,
},
}
result, err := kreuzberg.ExtractFileSync("scanned.pdf", cfg)
if err != nil {
log.Fatalf("extract failed: %v", err)
}
log.Println(len(result.Content))
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.OcrConfig;
import java.io.IOException;

/** OCR extraction of a scanned PDF using the Tesseract backend. */
public class Main {
    public static void main(String[] args) {
        OcrConfig ocr = OcrConfig.builder()
                .backend("tesseract")
                .language("eng")
                .build();
        ExtractionConfig config = ExtractionConfig.builder()
                .ocr(ocr)
                .build();
        try {
            ExtractionResult extraction = Kreuzberg.extractFile("scanned.pdf", config);
            System.out.println(extraction.getContent());
        } catch (IOException | KreuzbergException e) {
            System.err.println("Extraction failed: " + e.getMessage());
        }
    }
}
from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig

# Run the scanned document through Tesseract OCR (English).
ocr_settings = OcrConfig(backend="tesseract", language="eng")
config: ExtractionConfig = ExtractionConfig(ocr=ocr_settings)

result = extract_file_sync("scanned.pdf", config=config)

content: str = result.content
preview: str = content[:100]
total_length: int = len(content)
print(f"Extracted content (preview): {preview}")
print(f"Total characters: {total_length}")
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};

/// OCR extraction of a scanned PDF via the Tesseract backend.
fn main() -> kreuzberg::Result<()> {
    let ocr = OcrConfig {
        backend: "tesseract".to_string(),
        language: Some("eng".to_string()),
        ..Default::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        ..Default::default()
    };

    let extraction = extract_file_sync("scanned.pdf", None, &config)?;
    println!("{}", extraction.content);
    Ok(())
}
alias Kreuzberg.ExtractionConfig

# Enable the Tesseract OCR backend for scanned input.
ocr_settings = %{"enabled" => true, "backend" => "tesseract"}
config = %ExtractionConfig{ocr: ocr_settings}

{:ok, result} = Kreuzberg.extract_file("scanned_document.pdf", nil, config)

IO.puts("OCR Extracted content:")
IO.puts(result.content)
IO.puts("Metadata: #{inspect(result.metadata)}")
import { enableOcr, extractFromFile, initWasm } from '@kreuzberg/wasm';

// OCR support is loaded separately on top of the base WASM runtime.
await initWasm();
await enableOcr();

const picker = document.getElementById('file') as HTMLInputElement;
const chosen = picker.files?.[0];
if (chosen) {
  const ocrOptions = {
    ocr: {
      backend: 'tesseract-wasm',
      language: 'eng',
    },
  };
  const result = await extractFromFile(chosen, chosen.type, ocrOptions);
  console.log(result.content);
}
Batch Processing¶
Process multiple files concurrently:
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
files := []string{"doc1.pdf", "doc2.docx", "doc3.pptx"}
results, err := kreuzberg.BatchExtractFilesSync(files, nil)
if err != nil {
log.Fatalf("batch extract failed: %v", err)
}
for i, result := range results {
if result == nil {
continue
}
fmt.Printf("File %d: %d characters\n", i+1, len(result.Content))
}
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;

// Batch extraction: one call returns one result per input file.
try {
    List<String> files = Arrays.asList("doc1.pdf", "doc2.docx", "doc3.pptx");
    List<ExtractionResult> results = Kreuzberg.batchExtractFiles(files, null);
    int index = 0;
    for (ExtractionResult extraction : results) {
        System.out.println("File " + (index + 1) + ": " + extraction.getContent().length() + " characters");
        index++;
    }
} catch (IOException | KreuzbergException e) {
    e.printStackTrace();
}
from kreuzberg import batch_extract_files_sync, ExtractionConfig

# One call extracts every file; results come back in input order.
files: list[str] = ["doc1.pdf", "doc2.docx", "doc3.pptx"]
results = batch_extract_files_sync(files, config=ExtractionConfig())

for i, result in enumerate(results, start=1):
    print(f"File {i}: {len(result.content)} characters")
use kreuzberg::{batch_extract_file_sync, ExtractionConfig};

/// Batch extraction: results are returned in the same order as the inputs.
fn main() -> kreuzberg::Result<()> {
    let files = vec!["doc1.pdf", "doc2.docx", "doc3.pptx"];
    let results = batch_extract_file_sync(&files, None, &ExtractionConfig::default())?;

    for (index, extraction) in results.iter().enumerate() {
        println!("File {}: {} characters", index + 1, extraction.content.len());
    }
    Ok(())
}
# Extract several files with a single call; results preserve input order.
file_paths = ["document1.pdf", "document2.pdf", "document3.pdf"]
{:ok, results} = Kreuzberg.batch_extract_files(file_paths)

for result <- results do
  IO.puts("File: #{result.mime_type}")
  IO.puts("Content length: #{byte_size(result.content)} characters")
  IO.puts("Tables: #{length(result.tables)}")
  IO.puts("---")
end

IO.puts("Total files processed: #{length(results)}")
import { extractFromFile, initWasm } from '@kreuzberg/wasm';

await initWasm();

// Extract every selected file in parallel.
const picker = document.getElementById('files') as HTMLInputElement;
const selected = Array.from(picker.files || []);
const results = await Promise.all(selected.map((f) => extractFromFile(f)));

results.forEach((result, i) => {
  console.log(`File ${i + 1}: ${result.content.length} characters`);
});
Extract from Bytes¶
When you already have file content in memory:
package main
import (
"log"
"os"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
data, err := os.ReadFile("document.pdf")
if err != nil {
log.Fatalf("read file: %v", err)
}
result, err := kreuzberg.ExtractBytesSync(data, "application/pdf", nil)
if err != nil {
log.Fatalf("extract failed: %v", err)
}
log.Println(result.Content)
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;

// Extract from bytes already held in memory, passing the MIME type explicitly.
try {
    byte[] payload = Files.readAllBytes(Paths.get("document.pdf"));
    ExtractionResult extraction =
            Kreuzberg.extractBytes(payload, "application/pdf", null);
    System.out.println(extraction.getContent());
} catch (IOException | KreuzbergException e) {
    e.printStackTrace();
}
# Load the document into memory first.
{:ok, binary} = File.read("document.pdf")

# Extract directly from the binary, passing the MIME type explicitly.
{:ok, result} = Kreuzberg.extract(binary, "application/pdf")

IO.puts("Extracted content:")
IO.puts(result.content)
IO.puts("MIME type: #{result.mime_type}")
IO.puts("Tables found: #{length(result.tables)}")
Not Applicable
The CLI operates on files from disk. For in-memory data processing, use language-specific bindings.
However, you can use the CLI together with shell pipes and temporary files for in-memory-style workflows.
Advanced Configuration¶
Customize extraction behavior:
using Kreuzberg;

// Full-featured configuration: OCR, chunking, token reduction,
// language detection, caching, and quality processing.
var extractionConfig = new ExtractionConfig
{
    Ocr = new OcrConfig { Backend = "tesseract", Language = "eng+deu" },
    Chunking = new ChunkingConfig { MaxChars = 1000, MaxOverlap = 100 },
    TokenReduction = new TokenReductionConfig { Enabled = true },
    LanguageDetection = new LanguageDetectionConfig
    {
        Enabled = true,
        DetectMultiple = true
    },
    UseCache = true,
    EnableQualityProcessing = true
};

var result = KreuzbergClient.ExtractFileSync("document.pdf", extractionConfig);

foreach (var chunk in result.Chunks)
{
    var previewLength = Math.Min(100, chunk.Content.Length);
    Console.WriteLine($"Chunk: {chunk.Content[..previewLength]}");
}

if (result.DetectedLanguages?.Count > 0)
{
    Console.WriteLine($"Languages: {string.Join(", ", result.DetectedLanguages)}");
}
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
lang := "eng+deu" // Multiple languages
chunkSize := 1000
chunkOverlap := 100
useCache := true
enableQuality := true
detectMultiple := true
config := &kreuzberg.ExtractionConfig{
OCR: &kreuzberg.OCRConfig{
Backend: "tesseract",
Language: &lang,
},
Chunking: &kreuzberg.ChunkingConfig{
ChunkSize: &chunkSize,
ChunkOverlap: &chunkOverlap,
},
LanguageDetection: &kreuzberg.LanguageDetectionConfig{
Enabled: &useCache,
DetectMultiple: &detectMultiple,
},
UseCache: &useCache,
EnableQualityProcessing: &enableQuality,
}
result, err := kreuzberg.ExtractFileSync("document.pdf", config)
if err != nil {
log.Fatalf("extract failed: %v", err)
}
// Access chunks
if len(result.Chunks) > 0 {
snippet := result.Chunks[0].Content
if len(snippet) > 100 {
snippet = snippet[:100]
}
fmt.Printf("First chunk: %s...\n", snippet)
}
// Access detected languages
if len(result.DetectedLanguages) > 0 {
fmt.Printf("Languages: %v\n", result.DetectedLanguages)
}
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import dev.kreuzberg.config.*;
import java.io.IOException;

/** Advanced configuration: OCR, chunking, token reduction, language detection. */
public class Main {
    public static void main(String[] args) {
        OcrConfig ocr = OcrConfig.builder()
                .backend("tesseract")
                .language("eng+deu")
                .build();
        ChunkingConfig chunking = ChunkingConfig.builder()
                .maxChars(1000)
                .maxOverlap(100)
                .build();
        TokenReductionConfig tokenReduction = TokenReductionConfig.builder()
                .mode("moderate")
                .preserveImportantWords(true)
                .build();
        LanguageDetectionConfig languageDetection = LanguageDetectionConfig.builder()
                .enabled(true)
                .build();
        ExtractionConfig config = ExtractionConfig.builder()
                .ocr(ocr)
                .chunking(chunking)
                .tokenReduction(tokenReduction)
                .languageDetection(languageDetection)
                .useCache(true)
                .enableQualityProcessing(true)
                .build();
        try {
            ExtractionResult result = Kreuzberg.extractFile("document.pdf", config);
            if (!result.getDetectedLanguages().isEmpty()) {
                System.out.println("Languages: " + result.getDetectedLanguages());
            }
        } catch (IOException | KreuzbergException e) {
            System.err.println("Extraction failed: " + e.getMessage());
        }
    }
}
from kreuzberg import (
    extract_file_sync,
    ExtractionConfig,
    OcrConfig,
    ChunkingConfig,
    TokenReductionConfig,
    LanguageDetectionConfig,
)

# Combine OCR, chunking, token reduction, and language detection.
config = ExtractionConfig(
    ocr=OcrConfig(backend="tesseract", language="eng+deu"),
    chunking=ChunkingConfig(max_chars=1000, max_overlap=100),
    token_reduction=TokenReductionConfig(enabled=True),
    language_detection=LanguageDetectionConfig(enabled=True, detect_multiple=True),
    use_cache=True,
    enable_quality_processing=True,
)

result = extract_file_sync("document.pdf", config=config)

for chunk in result.chunks:
    print(f"Chunk: {chunk.content[:100]}")

if result.detected_languages:
    print(f"Languages: {result.detected_languages}")
require 'kreuzberg'

# Advanced configuration: OCR, chunking, and language detection.
ocr_config = Kreuzberg::Config::OCR.new(
  backend: 'tesseract',
  language: 'eng+deu'
)
chunking_config = Kreuzberg::Config::Chunking.new(
  max_chars: 1000,
  max_overlap: 100
)
config = Kreuzberg::Config::Extraction.new(
  ocr: ocr_config,
  chunking: chunking_config,
  language_detection: Kreuzberg::Config::LanguageDetection.new,
  use_cache: true,
  enable_quality_processing: true
)

result = Kreuzberg.extract_file_sync('document.pdf', config: config)

result.chunks&.each { |chunk| puts chunk[0..100] }
puts "Languages: #{result.detected_languages.inspect}"
use kreuzberg::{
    extract_file_sync, ChunkingConfig, ExtractionConfig, LanguageDetectionConfig, OcrConfig,
};

/// Advanced configuration: OCR, chunking, and language detection.
fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: Some("eng+deu".to_string()),
            ..Default::default()
        }),
        chunking: Some(ChunkingConfig {
            max_chars: 1000,
            max_overlap: 100,
            ..Default::default()
        }),
        language_detection: Some(LanguageDetectionConfig {
            enabled: true,
            detect_multiple: true,
            ..Default::default()
        }),
        use_cache: true,
        enable_quality_processing: true,
        ..Default::default()
    };

    let result = extract_file_sync("document.pdf", None, &config)?;

    if let Some(chunks) = result.chunks {
        for chunk in chunks {
            // FIX: `&chunk[..100.min(chunk.len())]` slices at a byte offset
            // and panics when byte 100 falls inside a multi-byte UTF-8
            // character. Truncate on a character boundary instead.
            let preview: String = chunk.chars().take(100).collect();
            println!("Chunk: {}...", preview);
        }
    }

    if let Some(languages) = result.detected_languages {
        println!("Languages: {:?}", languages);
    }
    Ok(())
}
alias Kreuzberg.ExtractionConfig

# Advanced configuration: OCR, chunking, language detection, caching.
config = %ExtractionConfig{
  ocr: %{"enabled" => true, "backend" => "tesseract"},
  chunking: %{"max_chars" => 1000, "max_overlap" => 100},
  language_detection: %{"enabled" => true},
  use_cache: true,
  force_ocr: false
}

{:ok, result} = Kreuzberg.extract_file("document.pdf", nil, config)

chunk_count = if result.chunks, do: length(result.chunks), else: 0
IO.puts("Content length: #{byte_size(result.content)} characters")
IO.puts("Detected languages: #{inspect(result.detected_languages)}")
IO.puts("Chunks: #{chunk_count}")
import { extractFileSync } from '@kreuzberg/node';

// Advanced configuration: OCR, chunking, token reduction, language
// detection, caching, and quality processing.
const config = {
  ocr: {
    backend: 'tesseract',
    language: 'eng+deu',
  },
  chunking: {
    maxChars: 1000,
    maxOverlap: 100,
  },
  tokenReduction: {
    mode: 'aggressive',
  },
  languageDetection: {
    enabled: true,
    detectMultiple: true,
  },
  useCache: true,
  enableQualityProcessing: true,
};

const result = extractFileSync('document.pdf', null, config);

for (const chunk of result.chunks ?? []) {
  console.log(`Chunk: ${chunk.content.substring(0, 100)}...`);
}

const languages = result.detectedLanguages;
if (languages) {
  console.log(`Languages: ${languages.join(', ')}`);
}
import { extractFromFile, initWasm } from '@kreuzberg/wasm';

await initWasm();

// FIX: this snippet previously mixed naming schemes (`chunkOverlap`,
// `enable_language_detection`, `enable_quality`); aligned with the
// camelCase config shape used by every other example in this guide
// (`maxOverlap`, `languageDetection`, `enableQualityProcessing`) —
// confirm against the @kreuzberg/wasm API reference.
const config = {
  ocr: {
    backend: 'tesseract-wasm',
    language: 'eng',
  },
  chunking: {
    maxChars: 1000,
    maxOverlap: 100,
  },
  languageDetection: {
    enabled: true,
  },
  enableQualityProcessing: true,
};

const fileInput = document.getElementById('file') as HTMLInputElement;
const file = fileInput.files?.[0];
if (file) {
  const result = await extractFromFile(file, file.type, config);
  if (result.chunks) {
    for (const chunk of result.chunks) {
      console.log(`Chunk: ${chunk.content.substring(0, 100)}...`);
    }
  }
  if (result.detectedLanguages) {
    console.log(`Languages: ${result.detectedLanguages.join(', ')}`);
  }
}
Configure extraction behavior via command-line flags or config files:
# Using command-line flags
kreuzberg extract document.pdf \
--ocr \
--chunk --chunk-size 1000 --chunk-overlap 100 \
--detect-language \
--quality
# Using config file
kreuzberg extract document.pdf --config kreuzberg.toml
kreuzberg.toml:
# FIX: top-level keys must appear before the first [section] header —
# TOML assigns any key after a header to that table, so the original
# placement made these keys part of [language_detection].
enable_quality_processing = true
use_cache = true

[ocr]
backend = "tesseract"
language = "eng"

[chunking]
max_chunk_size = 1000
overlap = 100

[language_detection]
enabled = true
detect_multiple = true
kreuzberg.yaml: the same settings may also be supplied as an equivalent YAML file.
Working with Metadata¶
Access format-specific metadata from extracted documents:
using Kreuzberg;

// Working with metadata: opt in to PDF metadata extraction, then read
// format-specific fields from the result (PDF first, then HTML).
var config = new ExtractionConfig
{
    PdfOptions = new PdfConfig { ExtractMetadata = true }
};

var result = KreuzbergClient.ExtractFileSync("document.pdf", config);

// Format.Pdf is null when no PDF metadata was produced.
if (result.Metadata?.Format.Pdf != null)
{
    var pdfMeta = result.Metadata.Format.Pdf;
    Console.WriteLine($"Pages: {pdfMeta.PageCount}");
    Console.WriteLine($"Author: {pdfMeta.Author}");
    Console.WriteLine($"Title: {pdfMeta.Title}");
}

var htmlResult = KreuzbergClient.ExtractFileSync("page.html", config);

// Format.Html is null when no HTML metadata was produced.
if (htmlResult.Metadata?.Format.Html != null)
{
    var htmlMeta = htmlResult.Metadata.Format.Html;
    Console.WriteLine($"Title: {htmlMeta.Title}");
    Console.WriteLine($"Description: {htmlMeta.Description}");
    // Access keywords as array
    if (htmlMeta.Keywords != null && htmlMeta.Keywords.Count > 0)
    {
        Console.WriteLine($"Keywords: {string.Join(", ", htmlMeta.Keywords)}");
    }
    // Access canonical URL (renamed from canonical)
    if (htmlMeta.CanonicalUrl != null)
    {
        Console.WriteLine($"Canonical URL: {htmlMeta.CanonicalUrl}");
    }
    // Access Open Graph fields from dictionary
    if (htmlMeta.OpenGraph != null && htmlMeta.OpenGraph.Count > 0)
    {
        if (htmlMeta.OpenGraph.ContainsKey("image"))
            Console.WriteLine($"Open Graph Image: {htmlMeta.OpenGraph["image"]}");
        if (htmlMeta.OpenGraph.ContainsKey("title"))
            Console.WriteLine($"Open Graph Title: {htmlMeta.OpenGraph["title"]}");
        if (htmlMeta.OpenGraph.ContainsKey("type"))
            Console.WriteLine($"Open Graph Type: {htmlMeta.OpenGraph["type"]}");
    }
    // Access Twitter Card fields from dictionary
    if (htmlMeta.TwitterCard != null && htmlMeta.TwitterCard.Count > 0)
    {
        if (htmlMeta.TwitterCard.ContainsKey("card"))
            Console.WriteLine($"Twitter Card Type: {htmlMeta.TwitterCard["card"]}");
        if (htmlMeta.TwitterCard.ContainsKey("creator"))
            Console.WriteLine($"Twitter Creator: {htmlMeta.TwitterCard["creator"]}");
    }
    // Access new fields
    if (htmlMeta.Language != null)
        Console.WriteLine($"Language: {htmlMeta.Language}");
    if (htmlMeta.TextDirection != null)
        Console.WriteLine($"Text Direction: {htmlMeta.TextDirection}");
    // Access headers
    if (htmlMeta.Headers != null && htmlMeta.Headers.Count > 0)
        Console.WriteLine($"Headers: {string.Join(", ", htmlMeta.Headers.Select(h => h.Text))}");
    // Access links
    if (htmlMeta.Links != null && htmlMeta.Links.Count > 0)
    {
        foreach (var link in htmlMeta.Links)
            Console.WriteLine($"Link: {link.Href} ({link.Text})");
    }
    // Access images
    if (htmlMeta.Images != null && htmlMeta.Images.Count > 0)
        Console.WriteLine($"Images: {string.Join(", ", htmlMeta.Images.Select(i => i.Src))}");
    // Access structured data
    if (htmlMeta.StructuredData != null && htmlMeta.StructuredData.Count > 0)
        Console.WriteLine($"Structured Data items: {htmlMeta.StructuredData.Count}");
}
// Working with metadata: extract a PDF and an HTML page, then read the
// typed, format-specific metadata accessors. All fields are optional
// (pointer or slice), so each access is guarded.
package main

import (
	"fmt"
	"log"
	"strings"

	"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

func main() {
	result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
	if err != nil {
		log.Fatalf("extract pdf: %v", err)
	}

	// Access PDF metadata (second return value reports presence).
	if pdf, ok := result.Metadata.PdfMetadata(); ok {
		if pdf.PageCount != nil {
			fmt.Printf("Pages: %d\n", *pdf.PageCount)
		}
		if pdf.Author != nil {
			fmt.Printf("Author: %s\n", *pdf.Author)
		}
		if pdf.Title != nil {
			fmt.Printf("Title: %s\n", *pdf.Title)
		}
	}

	// Access HTML metadata
	htmlResult, err := kreuzberg.ExtractFileSync("page.html", nil)
	if err != nil {
		log.Fatalf("extract html: %v", err)
	}
	if html, ok := htmlResult.Metadata.HTMLMetadata(); ok {
		if html.Title != nil {
			fmt.Printf("Title: %s\n", *html.Title)
		}
		if html.Description != nil {
			fmt.Printf("Description: %s\n", *html.Description)
		}
		// Access keywords as array
		if len(html.Keywords) > 0 {
			fmt.Printf("Keywords: %s\n", strings.Join(html.Keywords, ", "))
		}
		// Access canonical URL (renamed from canonical)
		if html.CanonicalURL != nil {
			fmt.Printf("Canonical URL: %s\n", *html.CanonicalURL)
		}
		// Access Open Graph fields from map
		if len(html.OpenGraph) > 0 {
			if image, ok := html.OpenGraph["image"]; ok {
				fmt.Printf("Open Graph Image: %s\n", image)
			}
			if ogTitle, ok := html.OpenGraph["title"]; ok {
				fmt.Printf("Open Graph Title: %s\n", ogTitle)
			}
			if ogType, ok := html.OpenGraph["type"]; ok {
				fmt.Printf("Open Graph Type: %s\n", ogType)
			}
		}
		// Access Twitter Card fields from map
		if len(html.TwitterCard) > 0 {
			if card, ok := html.TwitterCard["card"]; ok {
				fmt.Printf("Twitter Card Type: %s\n", card)
			}
			if creator, ok := html.TwitterCard["creator"]; ok {
				fmt.Printf("Twitter Creator: %s\n", creator)
			}
		}
		// Access new fields
		if html.Language != nil {
			fmt.Printf("Language: %s\n", *html.Language)
		}
		if html.TextDirection != nil {
			fmt.Printf("Text Direction: %s\n", *html.TextDirection)
		}
		// Access headers (joined into a single comma-separated line)
		if len(html.Headers) > 0 {
			headers := make([]string, len(html.Headers))
			for i, h := range html.Headers {
				headers[i] = h.Text
			}
			fmt.Printf("Headers: %s\n", strings.Join(headers, ", "))
		}
		// Access links
		if len(html.Links) > 0 {
			for _, link := range html.Links {
				fmt.Printf("Link: %s (%s)\n", link.Href, link.Text)
			}
		}
		// Access images
		if len(html.Images) > 0 {
			for _, image := range html.Images {
				fmt.Printf("Image: %s\n", image.Src)
			}
		}
		// Access structured data
		if len(html.StructuredData) > 0 {
			fmt.Printf("Structured data items: %d\n", len(html.StructuredData))
		}
	}
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;
import java.util.Map;
import java.util.List;

/**
 * Working with metadata: extract a PDF and an HTML page, then read the
 * format-specific entries out of the metadata map.
 *
 * FIX: the original called {@code Kreuzberg.extractFileSync(...)}, which is
 * inconsistent with every other Java example in this guide (they all use
 * {@code Kreuzberg.extractFile(...)}).
 */
public class Main {
    public static void main(String[] args) {
        try {
            ExtractionResult result = Kreuzberg.extractFile("document.pdf");
            // Access PDF metadata
            @SuppressWarnings("unchecked")
            Map<String, Object> pdfMeta = (Map<String, Object>) result.getMetadata().get("pdf");
            if (pdfMeta != null) {
                System.out.println("Pages: " + pdfMeta.get("page_count"));
                System.out.println("Author: " + pdfMeta.get("author"));
                System.out.println("Title: " + pdfMeta.get("title"));
            }
            // Access HTML metadata
            ExtractionResult htmlResult = Kreuzberg.extractFile("page.html");
            @SuppressWarnings("unchecked")
            Map<String, Object> htmlMeta = (Map<String, Object>) htmlResult.getMetadata().get("html");
            if (htmlMeta != null) {
                System.out.println("Title: " + htmlMeta.get("title"));
                System.out.println("Description: " + htmlMeta.get("description"));
                // Access keywords as array
                @SuppressWarnings("unchecked")
                List<String> keywords = (List<String>) htmlMeta.get("keywords");
                if (keywords != null) {
                    System.out.println("Keywords: " + keywords);
                }
                // Access canonical URL (renamed from canonical)
                String canonicalUrl = (String) htmlMeta.get("canonical_url");
                if (canonicalUrl != null) {
                    System.out.println("Canonical URL: " + canonicalUrl);
                }
                // Access Open Graph fields from map
                @SuppressWarnings("unchecked")
                Map<String, String> openGraph = (Map<String, String>) htmlMeta.get("open_graph");
                if (openGraph != null) {
                    System.out.println("Open Graph Image: " + openGraph.get("image"));
                    System.out.println("Open Graph Title: " + openGraph.get("title"));
                    System.out.println("Open Graph Type: " + openGraph.get("type"));
                }
                // Access Twitter Card fields from map
                @SuppressWarnings("unchecked")
                Map<String, String> twitterCard = (Map<String, String>) htmlMeta.get("twitter_card");
                if (twitterCard != null) {
                    System.out.println("Twitter Card Type: " + twitterCard.get("card"));
                    System.out.println("Twitter Creator: " + twitterCard.get("creator"));
                }
                // Access new fields
                String language = (String) htmlMeta.get("language");
                if (language != null) {
                    System.out.println("Language: " + language);
                }
                String textDirection = (String) htmlMeta.get("text_direction");
                if (textDirection != null) {
                    System.out.println("Text Direction: " + textDirection);
                }
                // Access headers
                @SuppressWarnings("unchecked")
                List<Map<String, Object>> headers = (List<Map<String, Object>>) htmlMeta.get("headers");
                if (headers != null) {
                    headers.stream()
                            .map(h -> h.get("text"))
                            .forEach(text -> System.out.print(text + ", "));
                    System.out.println();
                }
                // Access links
                @SuppressWarnings("unchecked")
                List<Map<String, Object>> links = (List<Map<String, Object>>) htmlMeta.get("links");
                if (links != null) {
                    for (Map<String, Object> link : links) {
                        System.out.println("Link: " + link.get("href") + " (" + link.get("text") + ")");
                    }
                }
                // Access images
                @SuppressWarnings("unchecked")
                List<Map<String, Object>> images = (List<Map<String, Object>>) htmlMeta.get("images");
                if (images != null) {
                    for (Map<String, Object> image : images) {
                        System.out.println("Image: " + image.get("src"));
                    }
                }
                // Access structured data
                @SuppressWarnings("unchecked")
                List<Map<String, Object>> structuredData = (List<Map<String, Object>>) htmlMeta.get("structured_data");
                if (structuredData != null) {
                    System.out.println("Structured data items: " + structuredData.size());
                }
            }
        } catch (IOException | KreuzbergException e) {
            System.err.println("Extraction failed: " + e.getMessage());
        }
    }
}
from kreuzberg import extract_file_sync, ExtractionConfig

# Working with metadata: extract a PDF and an HTML page, then read the
# format-specific sub-dicts out of result.metadata. Every field is
# optional, so access is guarded with .get() throughout.
result = extract_file_sync("document.pdf", config=ExtractionConfig())

pdf_meta: dict = result.metadata.get("pdf", {})
if pdf_meta:
    print(f"Pages: {pdf_meta.get('page_count')}")
    print(f"Author: {pdf_meta.get('author')}")
    print(f"Title: {pdf_meta.get('title')}")

result = extract_file_sync("page.html", config=ExtractionConfig())

html_meta: dict = result.metadata.get("html", {})
if html_meta:
    print(f"Title: {html_meta.get('title')}")
    print(f"Description: {html_meta.get('description')}")
    # Access keywords as array
    keywords = html_meta.get('keywords', [])
    if keywords:
        print(f"Keywords: {', '.join(keywords)}")
    # Access canonical URL (renamed from canonical)
    canonical_url = html_meta.get('canonical_url')
    if canonical_url:
        print(f"Canonical URL: {canonical_url}")
    # Access Open Graph fields from map
    open_graph = html_meta.get('open_graph', {})
    if open_graph:
        if 'image' in open_graph:
            print(f"Open Graph Image: {open_graph['image']}")
        if 'title' in open_graph:
            print(f"Open Graph Title: {open_graph['title']}")
        if 'type' in open_graph:
            print(f"Open Graph Type: {open_graph['type']}")
    # Access Twitter Card fields from map
    twitter_card = html_meta.get('twitter_card', {})
    if twitter_card:
        if 'card' in twitter_card:
            print(f"Twitter Card Type: {twitter_card['card']}")
        if 'creator' in twitter_card:
            print(f"Twitter Creator: {twitter_card['creator']}")
    # Access new fields
    language = html_meta.get('language')
    if language:
        print(f"Language: {language}")
    text_direction = html_meta.get('text_direction')
    if text_direction:
        print(f"Text Direction: {text_direction}")
    # Access headers
    headers = html_meta.get('headers', [])
    if headers:
        print(f"Headers: {', '.join([h['text'] for h in headers])}")
    # Access links
    links = html_meta.get('links', [])
    if links:
        for link in links:
            print(f"Link: {link.get('href')} ({link.get('text')})")
    # Access images
    images = html_meta.get('images', [])
    if images:
        for image in images:
            print(f"Image: {image.get('src')}")
    # Access structured data
    structured_data = html_meta.get('structured_data', [])
    if structured_data:
        print(f"Structured data items: {len(structured_data)}")
require 'kreuzberg'

# Working with metadata: extract a PDF and an HTML page, then read the
# format-specific hashes out of result.metadata. Fields are optional,
# so each access is guarded.
result = Kreuzberg.extract_file_sync('document.pdf')

# Access PDF metadata
if result.metadata['pdf']
  pdf_meta = result.metadata['pdf']
  puts "Pages: #{pdf_meta['page_count']}"
  puts "Author: #{pdf_meta['author']}"
  puts "Title: #{pdf_meta['title']}"
end

# Access HTML metadata
html_result = Kreuzberg.extract_file_sync('page.html')
if html_result.metadata['html']
  html_meta = html_result.metadata['html']
  puts "Title: #{html_meta['title']}"
  puts "Description: #{html_meta['description']}"
  # Access keywords as array
  puts "Keywords: #{html_meta['keywords']}"
  # Access canonical URL (renamed from canonical)
  puts "Canonical URL: #{html_meta['canonical_url']}" if html_meta['canonical_url']
  # Access Open Graph fields from map
  open_graph = html_meta['open_graph'] || {}
  puts "Open Graph Image: #{open_graph['image']}" if open_graph['image']
  puts "Open Graph Title: #{open_graph['title']}" if open_graph['title']
  puts "Open Graph Type: #{open_graph['type']}" if open_graph['type']
  # Access Twitter Card fields from map
  twitter_card = html_meta['twitter_card'] || {}
  puts "Twitter Card Type: #{twitter_card['card']}" if twitter_card['card']
  puts "Twitter Creator: #{twitter_card['creator']}" if twitter_card['creator']
  # Access new fields
  puts "Language: #{html_meta['language']}" if html_meta['language']
  puts "Text Direction: #{html_meta['text_direction']}" if html_meta['text_direction']
  # Access headers
  if html_meta['headers']
    puts "Headers: #{html_meta['headers'].map { |h| h['text'] }.join(', ')}"
  end
  # Access links
  if html_meta['links']
    html_meta['links'].each do |link|
      puts "Link: #{link['href']} (#{link['text']})"
    end
  end
  # Access images
  if html_meta['images']
    html_meta['images'].each do |image|
      puts "Image: #{image['src']}"
    end
  end
  # Access structured data
  if html_meta['structured_data']
    puts "Structured data items: #{html_meta['structured_data'].length}"
  end
end
use kreuzberg::{extract_file_sync, ExtractionConfig};

/// Working with metadata: extract a PDF and an HTML page, then read the
/// typed, format-specific metadata structs. Optional fields are `Option`s
/// and collections may be empty, so every access is guarded.
fn main() -> kreuzberg::Result<()> {
    let result = extract_file_sync("document.pdf", None, &ExtractionConfig::default())?;

    // PDF-specific metadata, when present.
    if let Some(pdf_meta) = result.metadata.pdf {
        if let Some(pages) = pdf_meta.page_count {
            println!("Pages: {}", pages);
        }
        if let Some(author) = pdf_meta.author {
            println!("Author: {}", author);
        }
        if let Some(title) = pdf_meta.title {
            println!("Title: {}", title);
        }
    }

    let html_result = extract_file_sync("page.html", None, &ExtractionConfig::default())?;

    // HTML-specific metadata, when present.
    if let Some(html_meta) = html_result.metadata.html {
        if let Some(title) = html_meta.title {
            println!("Title: {}", title);
        }
        if let Some(desc) = html_meta.description {
            println!("Description: {}", desc);
        }
        // Access keywords array
        println!("Keywords: {:?}", html_meta.keywords);
        // Access canonical URL (renamed from canonical)
        if let Some(canonical) = html_meta.canonical_url {
            println!("Canonical URL: {}", canonical);
        }
        // Access Open Graph fields as a map
        if let Some(og_image) = html_meta.open_graph.get("image") {
            println!("Open Graph Image: {}", og_image);
        }
        if let Some(og_title) = html_meta.open_graph.get("title") {
            println!("Open Graph Title: {}", og_title);
        }
        // Access Twitter Card fields as a map
        if let Some(twitter_card) = html_meta.twitter_card.get("card") {
            println!("Twitter Card Type: {}", twitter_card);
        }
        // Access new fields
        if let Some(lang) = html_meta.language {
            println!("Language: {}", lang);
        }
        // Access headers
        if !html_meta.headers.is_empty() {
            for header in &html_meta.headers {
                println!("Header (level {}): {}", header.level, header.text);
            }
        }
        // Access links
        if !html_meta.links.is_empty() {
            for link in &html_meta.links {
                println!("Link: {} ({})", link.href, link.text);
            }
        }
        // Access images
        if !html_meta.images.is_empty() {
            for image in &html_meta.images {
                println!("Image: {}", image.src);
            }
        }
        // Access structured data
        if !html_meta.structured_data.is_empty() {
            println!("Structured data items: {}", html_meta.structured_data.len());
        }
    }
    Ok(())
}
# Inspect format-specific metadata returned by extraction.
{:ok, result} = Kreuzberg.extract_file("document.pdf")

metadata = result.metadata
IO.puts("MIME type: #{result.mime_type}")
IO.puts("All metadata keys: #{inspect(Map.keys(metadata))}")

# PDF-specific section (present only for PDF inputs).
pdf_meta = metadata["pdf"]

if is_map(pdf_meta) do
  IO.puts("Page count: #{pdf_meta["page_count"]}")
  IO.puts("Author: #{pdf_meta["author"]}")
  IO.puts("Title: #{pdf_meta["title"]}")
else
  IO.puts("No PDF metadata available")
end

# HTML-specific section (present only for HTML inputs).
html_meta = metadata["html"]

if is_map(html_meta) do
  IO.puts("HTML keywords: #{inspect(html_meta["keywords"])}")
else
  IO.puts("No HTML metadata available")
end
import { extractFileSync } from '@kreuzberg/node';

// Pull a document's metadata and print whichever fields are present.
const result = extractFileSync('document.pdf');
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
if (result.metadata.pageCount) {
  console.log(`Pages: ${result.metadata.pageCount}`);
}

// HTML documents expose a much richer metadata surface.
const htmlResult = extractFileSync('page.html');
console.log(`HTML Metadata: ${JSON.stringify(htmlResult.metadata)}`);
const htmlMeta = htmlResult.metadata;
if (htmlMeta.title) {
  console.log(`Title: ${htmlMeta.title}`);
}
// keywords is an array, not a comma-separated string.
if (htmlMeta.keywords?.length > 0) {
  console.log(`Keywords: ${htmlMeta.keywords.join(', ')}`);
}
// canonicalUrl superseded the old `canonical` field name.
if (htmlMeta.canonicalUrl) {
  console.log(`Canonical URL: ${htmlMeta.canonicalUrl}`);
}
// Open Graph values live in a single map keyed by property name.
const og = htmlMeta.openGraph;
if (og?.image) {
  console.log(`Open Graph Image: ${og.image}`);
}
if (og?.title) {
  console.log(`Open Graph Title: ${og.title}`);
}
if (og?.type) {
  console.log(`Open Graph Type: ${og.type}`);
}
// Twitter Card values use the same map layout.
const twitter = htmlMeta.twitterCard;
if (twitter?.card) {
  console.log(`Twitter Card Type: ${twitter.card}`);
}
if (twitter?.creator) {
  console.log(`Twitter Creator: ${twitter.creator}`);
}
if (htmlMeta.language) {
  console.log(`Language: ${htmlMeta.language}`);
}
if (htmlMeta.textDirection) {
  console.log(`Text Direction: ${htmlMeta.textDirection}`);
}
// Headers, links and images are arrays; iterating a missing one via
// `?? []` prints nothing, matching the original guarded loops.
if (htmlMeta.headers?.length > 0) {
  console.log(`Headers: ${htmlMeta.headers.map((h) => h.text).join(', ')}`);
}
for (const link of htmlMeta.links ?? []) {
  console.log(`Link: ${link.href} (${link.text})`);
}
for (const image of htmlMeta.images ?? []) {
  console.log(`Image: ${image.src}`);
}
if (htmlMeta.structuredData?.length > 0) {
  console.log(`Structured data items: ${htmlMeta.structuredData.length}`);
}
import { extractFromFile, initWasm } from '@kreuzberg/wasm';

// The WASM build must be initialised before first use.
await initWasm();

const fileInput = document.getElementById('file') as HTMLInputElement;
const file = fileInput.files?.[0];
if (file) {
  const result = await extractFromFile(file);
  console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
  // Common fields sit at the top level of the metadata object.
  if (result.metadata.title) {
    console.log(`Title: ${result.metadata.title}`);
  }
  const metadata = result.metadata;

  // HTML documents carry a dedicated metadata section.
  const htmlMeta = metadata.html;
  if (htmlMeta) {
    console.log(`HTML Title: ${htmlMeta.title}`);
    console.log(`Description: ${htmlMeta.description}`);
    if (htmlMeta.keywords?.length > 0) {
      console.log(`Keywords: ${htmlMeta.keywords.join(', ')}`);
    }
    if (htmlMeta.canonical_url) {
      console.log(`Canonical URL: ${htmlMeta.canonical_url}`);
    }
    // Open Graph and Twitter Card values are plain string maps.
    const og = htmlMeta.open_graph;
    if (og?.['title']) {
      console.log(`OG Title: ${og['title']}`);
    }
    if (og?.['image']) {
      console.log(`OG Image: ${og['image']}`);
    }
    if (htmlMeta.twitter_card?.['card']) {
      console.log(`Twitter Card Type: ${htmlMeta.twitter_card['card']}`);
    }
    // Headers, links and images are arrays; a missing one falls back
    // to `[]`, so the loops below simply print nothing.
    if (htmlMeta.headers?.length > 0) {
      console.log(`Headers: ${htmlMeta.headers.map((h: any) => h.text).join(', ')}`);
    }
    for (const link of htmlMeta.links ?? []) {
      console.log(`Link: ${link.href} (${link.text})`);
    }
    for (const image of htmlMeta.images ?? []) {
      console.log(`Image: ${image.src}`);
    }
    if (htmlMeta.structured_data?.length > 0) {
      console.log(`Structured data items: ${htmlMeta.structured_data.length}`);
    }
  }

  // PDF documents expose page count and author information.
  const pdfMeta = metadata.pdf;
  if (pdfMeta) {
    if (pdfMeta.page_count) {
      console.log(`Pages: ${pdfMeta.page_count}`);
    }
    if (pdfMeta.author) {
      console.log(`Author: ${pdfMeta.author}`);
    }
  }
}
Extract and parse metadata using JSON output:
# Extract with metadata (JSON format includes metadata automatically)
kreuzberg extract document.pdf --format json

# Save to file and parse metadata
kreuzberg extract document.pdf --format json > result.json

# Extract PDF metadata (jq reads the file directly; no `cat` pipeline needed)
jq '.metadata.pdf' result.json

# Extract HTML metadata
kreuzberg extract page.html --format json | jq '.metadata.html'

# Get specific fields
kreuzberg extract document.pdf --format json | \
jq '.metadata | {page_count, author, title}'

# Process multiple files
kreuzberg batch documents/*.pdf --format json > all_metadata.json
JSON Output Structure:
Kreuzberg extracts format-specific metadata for:

- **PDF**: page count, title, author, subject, keywords, dates
- **HTML**: rich metadata including SEO tags, Open Graph, Twitter Card, structured data, headers, links, images
- **Excel**: sheet count, sheet names
- **Email**: from, to, CC, BCC, message ID, attachments
- **PowerPoint**: title, author, description, fonts
- **Images**: dimensions, format, EXIF data
- **Archives**: format, file count, file list, sizes
- **XML**: element count, unique elements
- **Text/Markdown**: word count, line count, headers, links
HTML Metadata Structure (v4.0+)
HTML metadata has been restructured for better organization:

- `keywords`: now a `Vec<String>` array (was `Option<String>`)
- `canonical` → `canonical_url`: renamed for clarity
- Open Graph fields: consolidated into `open_graph: Map<String, String>` (replacing the individual `og_*` fields)
- Twitter Card fields: consolidated into `twitter_card: Map<String, String>` (replacing the individual `twitter_*` fields)
- New fields: `headers`, `links`, `images`, `structured_data`, `language`, `text_direction`, `meta_tags`
See Types Reference for complete HTML metadata reference and examples.
See Types Reference for complete metadata reference.
Working with Tables¶
Extract and process tables from documents:
using Kreuzberg;

// Extract a document and walk every detected table.
var extraction = KreuzbergClient.ExtractFileSync("document.pdf", new ExtractionConfig());

foreach (var table in extraction.Tables)
{
    // Cells is a row-major list of rows; Markdown renders the same data as text.
    Console.WriteLine($"Table with {table.Cells.Count} rows");
    Console.WriteLine(table.Markdown);

    foreach (var tableRow in table.Cells)
    {
        Console.WriteLine(string.Join(" | ", tableRow));
    }
}
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
if err != nil {
log.Fatalf("extract failed: %v", err)
}
// Iterate over tables
for _, table := range result.Tables {
fmt.Printf("Table with %d rows\n", len(table.Cells))
fmt.Println(table.Markdown) // Markdown representation
// Access cells
for _, row := range table.Cells {
fmt.Println(row)
}
}
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import dev.kreuzberg.Table;
import java.io.IOException;
import java.util.List;

/**
 * Extracts a PDF and prints every detected table.
 */
public class Main {
    public static void main(String[] args) {
        try {
            ExtractionResult extraction = Kreuzberg.extractFile("document.pdf");
            for (Table table : extraction.getTables()) {
                // cells() is row-major; markdown() renders the same data as text.
                System.out.println("Table with " + table.cells().size() + " rows");
                System.out.println(table.markdown());
                for (List<String> tableRow : table.cells()) {
                    System.out.println(tableRow);
                }
            }
        } catch (IOException | KreuzbergException e) {
            System.err.println("Extraction failed: " + e.getMessage());
        }
    }
}
from kreuzberg import extract_file_sync, ExtractionConfig, ExtractedTable

# Extract a document and print every detected table.
result = extract_file_sync("document.pdf", config=ExtractionConfig())

# Annotate with ExtractedTable so the imported type is actually used
# (previously imported but unreferenced).
tables: list[ExtractedTable] = result.tables
for table in tables:
    row_count: int = len(table.cells)
    print(f"Table with {row_count} rows")
    # Markdown rendering of the same cell data.
    print(table.markdown)
    for row in table.cells:
        print(row)
use kreuzberg::{extract_file_sync, ExtractionConfig};

/// Extracts a PDF and prints every detected table.
fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig::default();
    let extraction = extract_file_sync("document.pdf", None, &config)?;

    for tbl in &extraction.tables {
        // `cells` is row-major; `markdown` renders the same table as text.
        println!("Table with {} rows", tbl.cells.len());
        println!("{}", tbl.markdown);

        for row in &tbl.cells {
            println!("{:?}", row);
        }
    }
    Ok(())
}
# Extract a document and print every detected table.
{:ok, result} = Kreuzberg.extract_file("document.pdf")

tables = result.tables
IO.puts("Total tables found: #{length(tables)}")

tables
|> Enum.with_index(1)
|> Enum.each(fn {table, index} ->
  IO.puts("\n--- Table #{index} ---")

  # Cells default to an empty list when the key is missing.
  rows = table["cells"] || []
  IO.puts("Rows: #{length(rows)}")

  # Markdown rendering of the same cell data.
  IO.puts("Markdown representation:")
  IO.puts(table["markdown"])
end)
import { extractFromFile, initWasm } from '@kreuzberg/wasm';

// The WASM build must be initialised before first use.
await initWasm();

const fileInput = document.getElementById('file') as HTMLInputElement;
const selected = fileInput.files?.[0];
if (selected) {
  const extraction = await extractFromFile(selected);
  for (const table of extraction.tables) {
    // cells is row-major; markdown renders the same table as text.
    console.log(`Table with ${table.cells.length} rows`);
    console.log(`Page: ${table.pageNumber}`);
    console.log(table.markdown);
  }
}
Extract and process tables from documents:
# Extract with JSON format (includes tables when detected)
kreuzberg extract document.pdf --format json
# Save tables to JSON
kreuzberg extract spreadsheet.xlsx --format json > tables.json
# Extract and parse table markdown
# (`.tables[]?` tolerates documents where no tables were detected)
kreuzberg extract document.pdf --format json | \
jq '.tables[]? | .markdown'
# Get table cells
kreuzberg extract document.pdf --format json | \
jq '.tables[]? | .cells'
# Batch extract tables from multiple files
# NOTE: recursive `**` matching requires `shopt -s globstar` in bash
# (zsh supports it by default)
kreuzberg batch documents/**/*.pdf --format json > all_tables.json
JSON Table Structure:
Error Handling¶
Handle extraction errors gracefully:
using Kreuzberg;
// Catch the most specific Kreuzberg exception types first, falling back
// to the KreuzbergException base type for anything else.
try
{
var result = KreuzbergClient.ExtractFileSync("missing.pdf");
Console.WriteLine(result.Content);
}
// Invalid input or configuration; handled here, not rethrown.
catch (KreuzbergValidationException ex)
{
Console.Error.WriteLine($"Validation error: {ex.Message}");
}
// File-system failure (e.g. missing file); logged, then rethrown.
catch (KreuzbergIOException ex)
{
Console.Error.WriteLine($"IO error: {ex.Message}");
throw;
}
// Any other extraction failure; logged, then rethrown.
catch (KreuzbergException ex)
{
Console.Error.WriteLine($"Extraction failed: {ex.Message}");
throw;
}
package main
import (
"errors"
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
// main classifies extraction failures with errors.As: each case matches
// one concrete Kreuzberg error type anywhere in the wrapped error chain.
func main() {
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
if err != nil {
// new(*kreuzberg.XxxError) yields the **XxxError target errors.As needs.
switch {
case errors.As(err, new(*kreuzberg.ValidationError)):
log.Fatalf("invalid configuration: %v", err)
case errors.As(err, new(*kreuzberg.ParsingError)):
log.Fatalf("failed to parse document: %v", err)
case errors.As(err, new(*kreuzberg.OCRError)):
log.Fatalf("OCR processing failed: %v", err)
case errors.As(err, new(*kreuzberg.MissingDependencyError)):
log.Fatalf("missing dependency: %v", err)
default:
log.Fatalf("extraction error: %v", err)
}
}
fmt.Println(result.Content)
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;
// File-based extraction: IOException covers file-system failures,
// KreuzbergException covers extraction failures. Prints at most the
// first 100 characters of the extracted content.
try {
ExtractionResult result = Kreuzberg.extractFile("document.pdf");
System.out.println("Extracted: " + result.getContent()
.substring(0, Math.min(100, result.getContent().length())));
} catch (IOException e) {
System.err.println("File not found: " + e.getMessage());
} catch (KreuzbergException e) {
System.err.println("Extraction failed: " + e.getMessage());
}
// Byte-based extraction: takes the raw payload plus an explicit MIME
// type; the third argument (config) may be null.
try {
byte[] pdfBytes = new byte[] { };
ExtractionResult result = Kreuzberg.extractBytes(pdfBytes, "application/pdf", null);
System.out.println("Extracted " + result.getContent().length() + " characters");
} catch (KreuzbergException e) {
System.err.println("Extraction failed: " + e.getMessage());
}
# Import the extraction entry points together with the error hierarchy;
# a single statement replaces the previous duplicate `from kreuzberg import`.
from kreuzberg import (
    ExtractionConfig,
    KreuzbergError,
    OCRError,
    ParsingError,
    ValidationError,
    extract_bytes_sync,
    extract_file_sync,
)

# File-based extraction: catch specific errors first; KreuzbergError is
# the common base class and catches everything else from the library.
try:
    result = extract_file_sync("document.pdf")
    print(f"Extracted {len(result.content)} characters")
except ParsingError as e:
    print(f"Failed to parse document: {e}")
except OCRError as e:
    print(f"OCR processing failed: {e}")
except KreuzbergError as e:
    print(f"Extraction error: {e}")

# Byte-based extraction: takes the raw payload plus an explicit MIME type.
try:
    config: ExtractionConfig = ExtractionConfig()
    pdf_bytes: bytes = b"%PDF-1.4\n"
    result = extract_bytes_sync(pdf_bytes, "application/pdf", config)
    print(f"Extracted: {result.content[:100]}")
except ValidationError as e:
    print(f"Invalid configuration: {e}")
except OCRError as e:
    print(f"OCR failed: {e}")
except KreuzbergError as e:
    print(f"Extraction failed: {e}")
require 'kreuzberg'
# Rescue the most specific Kreuzberg errors first; Kreuzberg::Error is the
# library's base class, and StandardError catches anything outside it.
begin
result = Kreuzberg.extract_file_sync('document.pdf')
puts result.content
rescue Kreuzberg::ValidationError => e
puts "Invalid configuration: #{e.message}"
rescue Kreuzberg::ParsingError => e
puts "Failed to parse document: #{e.message}"
rescue Kreuzberg::OCRError => e
puts "OCR processing failed: #{e.message}"
rescue Kreuzberg::MissingDependencyError => e
puts "Missing dependency: #{e.message}"
# Catch-all for any other library error.
rescue Kreuzberg::Error => e
puts "Extraction error: #{e.message}"
# Non-library failures (I/O, permissions, etc.).
rescue StandardError => e
puts "System error: #{e.message}"
end
use kreuzberg::{extract_file_sync, extract_bytes_sync, ExtractionConfig, KreuzbergError};

/// Demonstrates matching on the `KreuzbergError` variants returned by the
/// sync extraction APIs, and propagating errors after logging them.
fn main() -> kreuzberg::Result<()> {
    // File-based extraction: log each failure class, then keep going.
    match extract_file_sync("document.pdf", None, &ExtractionConfig::default()) {
        Ok(result) => {
            println!("Extracted {} characters", result.content.len());
        }
        Err(KreuzbergError::Parsing { message, .. }) => {
            eprintln!("Failed to parse document: {}", message);
        }
        Err(KreuzbergError::Ocr { message, .. }) => {
            eprintln!("OCR processing failed: {}", message);
        }
        Err(KreuzbergError::MissingDependency { message, .. }) => {
            eprintln!("Missing dependency: {}", message);
        }
        Err(e) => {
            eprintln!("Extraction failed: {}", e);
        }
    }

    // Byte-based extraction: log, then return the error to the caller.
    let pdf_bytes = b"%PDF-1.4\n...";
    match extract_bytes_sync(pdf_bytes, "application/pdf", None, &ExtractionConfig::default()) {
        Ok(result) => {
            // Print at most the first 100 bytes of content.
            // NOTE(review): byte slicing may panic on a non-ASCII char
            // boundary — acceptable for ASCII-only example output.
            println!("Extracted: {}", &result.content[..100.min(result.content.len())]);
            Ok(())
        }
        Err(KreuzbergError::Validation { message, .. }) => {
            eprintln!("Invalid configuration: {}", message);
            // `message` is owned here and eprintln! only borrows it, so the
            // error can be rebuilt without the previous redundant clone().
            Err(KreuzbergError::Validation { message, source: None })
        }
        Err(KreuzbergError::Ocr { message, .. }) => {
            eprintln!("OCR failed: {}", message);
            Err(KreuzbergError::Ocr { message, source: None })
        }
        Err(e) => {
            eprintln!("Extraction failed: {}", e);
            Err(e)
        }
    }
}
# Example: Handling extraction errors
# NOTE: interpolate error terms with inspect/1 — `reason` may be an atom,
# tuple or struct, and "#{reason}" raises Protocol.UndefinedError unless
# the term implements String.Chars (matches the inspect usage below).
case Kreuzberg.extract_file("document.pdf") do
  {:ok, result} ->
    IO.puts("Successfully extracted content")
    # byte_size/1 counts bytes, not grapheme clusters.
    IO.puts("Content length: #{byte_size(result.content)} characters")

  {:error, reason} ->
    IO.puts("Extraction failed: #{inspect(reason)}")
end

# Example: Handling with custom error message
result = Kreuzberg.extract_file("nonexistent.pdf")

case result do
  {:ok, data} ->
    IO.puts("File processed successfully")

  {:error, error} ->
    IO.puts("Error details: #{inspect(error)}")
end

# Example: Extract with pattern matching
case Kreuzberg.extract(<<>>, "application/pdf") do
  {:ok, result} ->
    IO.puts("Content: #{result.content}")

  # Binary reasons are safe to interpolate directly.
  {:error, msg} when is_binary(msg) ->
    IO.puts("Validation error: #{msg}")

  {:error, reason} ->
    IO.puts("Unknown error: #{inspect(reason)}")
end
import { extractFromFile, initWasm } from '@kreuzberg/wasm';

// The WASM build must be initialised before first use.
await initWasm();

const fileInput = document.getElementById('file') as HTMLInputElement;
const file = fileInput.files?.[0];
if (file) {
  try {
    const result = await extractFromFile(file);
    console.log(result.content);
  } catch (error) {
    // Rethrow anything that is not an Error instance untouched.
    if (!(error instanceof Error)) {
      throw error;
    }
    console.error(`Extraction error: ${error.message}`);
  }
}
Next Steps¶
- Contributing - Learn how to contribute to Kreuzberg