Quick Start¶
This guide walks you through Kreuzberg's core API — extracting text, handling errors, running OCR, and working with metadata. If you haven't installed your binding yet, do that first — see Installation.
TypeScript users: @kreuzberg/node for Node.js, @kreuzberg/wasm for browsers and edge runtimes — see Language Support.
Your First Extraction¶
Pass a file path to get its text content. Kreuzberg detects the format automatically:
#include "kreuzberg.h"
#include <stdio.h>
#include <stdlib.h>
/* Extract the text of document.pdf with the default configuration and print it.
 * Returns 0 on success and 1 when the extraction call yields no result. */
int main(void) {
    int exit_code = 0;
    KREUZBERGExtractionConfig *cfg = kreuzberg_extraction_config_default();
    KREUZBERGExtractionResult *extraction =
        kreuzberg_extract_file_sync("document.pdf", NULL, cfg);

    if (extraction != NULL) {
        /* The content string is handed back separately and freed on its own. */
        char *text = kreuzberg_extraction_result_content(extraction);
        printf("%s\n", text ? text : "(empty)");
        kreuzberg_free_string(text);
        kreuzberg_extraction_result_free(extraction);
    } else {
        /* No result: report the last error code and its context. */
        fprintf(stderr, "extraction failed (code %d): %s\n",
                kreuzberg_last_error_code(),
                kreuzberg_last_error_context());
        exit_code = 1;
    }

    /* The config is released on both paths. */
    kreuzberg_extraction_config_free(cfg);
    return exit_code;
}
import 'package:kreuzberg/kreuzberg.dart';
/// Extracts `document.pdf` and prints its text, MIME type, and table count.
Future<void> main() async {
  // Sync semantics — flutter_rust_bridge surfaces every call as a Future,
  // so even the *Sync entrypoints must be awaited from Dart.
  final extraction = await KreuzbergBridge.extractFileSync('document.pdf', null);

  print(extraction.content);
  print('MIME type: ${extraction.mimeType}');

  final tableCount = extraction.tables.length;
  print('Tables: $tableCount');
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import java.nio.file.Paths;
// Extract document.pdf with a default-built configuration, then print the
// content, the number of tables, and the metadata.
ExtractionConfig defaults = ExtractionConfig.builder().build();
ExtractionResult extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), defaults);

System.out.println(extraction.content());

// tables() may be null, so fall back to a count of zero.
int tableCount = extraction.tables() == null ? 0 : extraction.tables().size();
System.out.println("Tables: " + tableCount);

System.out.println("Metadata: " + extraction.metadata());
import dev.kreuzberg.*
import java.nio.file.Paths
/** Extracts `document.pdf` with a default-built configuration and prints a short summary. */
fun main() {
    val defaults = ExtractionConfig.builder().build()
    val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, defaults)

    println(extraction.content())
    println("MIME type: ${extraction.mimeType()}")

    // tables() is nullable; report zero when it is absent.
    val tableCount = extraction.tables()?.size ?: 0
    println("Tables: $tableCount")
}
require 'kreuzberg'
# Extract document.pdf with caching and quality processing switched on,
# then print the content length, MIME type, and quality score.
options = Kreuzberg::ExtractionConfig.new(use_cache: true, enable_quality_processing: true)
extraction = Kreuzberg.extract_file_sync('document.pdf', config: options)

puts "Extracted #{extraction.content.length} characters"
puts "MIME type: #{extraction.mime_type}"
puts "Quality score: #{extraction.quality_score}"
library(kreuzberg)
# Extract document.pdf as a PDF using the default configuration.
# The call's return value is parsed as JSON below.
raw_json <- extract_file_sync(
  path = "document.pdf",
  mime_type = "application/pdf",
  config = ExtractionConfig$default()
)

# simplifyVector = FALSE keeps the parsed structure as nested lists.
parsed <- jsonlite::fromJSON(raw_json, simplifyVector = FALSE)

cat(sprintf("MIME type: %s\n", parsed$mime_type))
cat(sprintf("Content length: %d characters\n", nchar(parsed$content)))
use kreuzberg::{extract_file_sync, ExtractionConfig};
/// Extracts `document.pdf` with the default configuration and prints its
/// text, MIME type, and table count.
///
/// # Errors
///
/// Propagates any error returned by `extract_file_sync`.
fn main() -> kreuzberg::Result<()> {
    let defaults = ExtractionConfig::default();
    let extraction = extract_file_sync("document.pdf", None, &defaults)?;
    let table_count = extraction.tables.len();

    println!("{}", extraction.content);
    println!("MIME type: {}", extraction.mime_type);
    println!("Tables: {table_count}");
    Ok(())
}
import { extractFromFile, initWasm } from "@kreuzberg/wasm";
// Initialise the WASM module first.
await initWasm();

const picker = document.getElementById("file") as HTMLInputElement;
const selected = picker.files?.[0];

// Nothing to do until the user has actually chosen a file.
if (selected) {
  const extraction = await extractFromFile(selected);
  console.log(extraction.content);
  console.log(`Tables: ${extraction.tables.length}`);
  console.log(`Metadata: ${JSON.stringify(extraction.metadata)}`);
}
const std = @import("std");
const kreuzberg = @import("kreuzberg");
/// Extracts document.pdf with an empty JSON config and prints the result JSON.
pub fn main() !void {
    var gpa_state = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa_state.deinit();
    const gpa = gpa_state.allocator();

    const json_from_ffi = try kreuzberg.extract_file_sync("document.pdf", null, "{}");
    // The buffer returned by the binding is released through the C allocator.
    defer std.heap.c_allocator.free(json_from_ffi);

    // Print from a copy owned by the general-purpose allocator.
    const json_copy = try gpa.dupe(u8, json_from_ffi);
    defer gpa.free(json_copy);

    const out = std.io.getStdOut().writer();
    try out.print("{s}\n", .{json_copy});
}
Handle Errors¶
Wrap extractions in error handling before going further. Kreuzberg reports specific errors for missing files, parse failures, and OCR problems — as typed exceptions, error values, or error codes, depending on the binding:
#include "kreuzberg.h"
#include <stdio.h>
#include <stdlib.h>
/* Provoke an extraction error on purpose and report it.
 * Returns 0 on success; on failure returns the error code, or 1 if that code is 0. */
int main(void) {
    int status = 0;
    KREUZBERGExtractionConfig *cfg = kreuzberg_extraction_config_default();

    /* Pass an unsupported MIME type to trigger an error. */
    KREUZBERGExtractionResult *extraction =
        kreuzberg_extract_bytes_sync(NULL, 0, "application/x-unknown", cfg);

    if (extraction == NULL) {
        int32_t err_code = kreuzberg_last_error_code();
        /* message is valid until the next FFI call on this thread — copy if needed. */
        const char *err_text = kreuzberg_last_error_context();
        fprintf(stderr, "error %d: %s\n", err_code, err_text ? err_text : "(no message)");
        /* Never exit with 0 on the failure path, even if the reported code is 0. */
        status = err_code != 0 ? err_code : 1;
    } else {
        char *text = kreuzberg_extraction_result_content(extraction);
        printf("%s\n", text ? text : "(empty)");
        kreuzberg_free_string(text);
        kreuzberg_extraction_result_free(extraction);
    }

    kreuzberg_extraction_config_free(cfg);
    return status;
}
using Kreuzberg;
// Extract a file and handle each Kreuzberg exception type in its own catch clause.
try
{
    var extraction = KreuzbergLib.ExtractFileSync("missing.pdf");
    Console.WriteLine(extraction.Content);
}
catch (KreuzbergValidationException validationError)
{
    // Validation problems are reported and not rethrown.
    Console.Error.WriteLine($"Validation error: {validationError.Message}");
}
catch (KreuzbergIOException ioError)
{
    // I/O failures are logged, then rethrown to the caller.
    Console.Error.WriteLine($"IO error: {ioError.Message}");
    throw;
}
catch (KreuzbergException otherError)
{
    // Any other KreuzbergException is logged and rethrown as well.
    Console.Error.WriteLine($"Extraction failed: {otherError.Message}");
    throw;
}
import 'package:kreuzberg/kreuzberg.dart';
/// Extracts `document.pdf` and prints either its content or the failure.
Future<void> main() async {
  try {
    final extraction = await KreuzbergBridge.extractFile('document.pdf', null);
    print(extraction.content);
  } on Exception catch (failure) {
    // flutter_rust_bridge converts every KreuzbergError variant
    // (Io / UnsupportedFormat / Parsing / MissingDependency, ...)
    // into a Dart exception whose message preserves the original context.
    print('Extraction failed: $failure');
  }
}
package main
import (
"errors"
"log"
"github.com/kreuzberg-dev/kreuzberg/v5"
)
// main extracts missing.pdf and logs a message that depends on the kind of
// error returned; on success it prints the extracted content.
func main() {
	extraction, err := kreuzberg.ExtractFileSync("missing.pdf", nil, kreuzberg.ExtractionConfig{})
	if err == nil {
		println("Content:", extraction.Content)
		return
	}

	// errors.Is compares against the error values exported by the binding.
	switch {
	case errors.Is(err, kreuzberg.ErrIo):
		log.Printf("file not found: %v", err)
	case errors.Is(err, kreuzberg.ErrUnsupportedFormat):
		log.Printf("unsupported format: %v", err)
	default:
		log.Printf("extraction error: %v", err)
	}
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.KreuzbergRsException;
import java.nio.file.Paths;
try {
ExtractionConfig config = ExtractionConfig.builder().build();
ExtractionResult result = Kreuzberg.extractFileSync(Paths.get("missing.pdf"), config);
System.out.println(result.content());
} catch (KreuzbergRsException e) {
System.err.println("Extraction failed: " + e.getMessage());
System.err.println("Error code: " + e.getCode());
}
import dev.kreuzberg.*
import java.nio.file.Paths
/**
 * Extracts `document.pdf` and prints its content. Kreuzberg failures are
 * reported with message and code; any other exception with its message.
 */
fun main() {
    val defaults = ExtractionConfig.builder().build()
    try {
        val extraction = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, defaults)
        println(extraction.content())
    } catch (failure: KreuzbergRsException) {
        // The library exception carries an error code next to the message.
        System.err.println("Extraction failed: ${failure.message}")
        System.err.println("Error code: ${failure.code}")
    } catch (unexpected: Exception) {
        System.err.println("Unexpected error: ${unexpected.message}")
    }
}
from kreuzberg import extract_file_sync, extract_bytes_sync, ExtractionConfig
from kreuzberg import (
KreuzbergError,
ParsingError,
OCRError,
ValidationError,
)
# Example 1: extract from a path. The specific error types are handled first
# and KreuzbergError last as the fallback.
try:
    file_result = extract_file_sync("document.pdf")
    print(f"Extracted {len(file_result.content)} characters")
except FileNotFoundError as missing:
    print(f"File not found: {missing}")
except ParsingError as parse_failure:
    print(f"Failed to parse document: {parse_failure}")
except OCRError as ocr_failure:
    print(f"OCR processing failed: {ocr_failure}")
except KreuzbergError as failure:
    print(f"Extraction error: {failure}")

# Example 2: extract from in-memory bytes with an explicit MIME type.
try:
    settings: ExtractionConfig = ExtractionConfig()
    payload: bytes = b"%PDF-1.4\n"
    bytes_result = extract_bytes_sync(payload, "application/pdf", settings)
    print(f"Extracted: {bytes_result.content[:100]}")
except ValidationError as invalid:
    print(f"Invalid configuration: {invalid}")
except OCRError as ocr_failure:
    print(f"OCR failed: {ocr_failure}")
except KreuzbergError as failure:
    print(f"Extraction failed: {failure}")
require 'kreuzberg'
# Extract a missing file and classify the failure by inspecting its message.
# Validation errors are reported and swallowed; everything else is re-raised.
begin
  result = Kreuzberg.extract_file_sync('missing.pdf')
  puts result.content
rescue RuntimeError => e
  # All extraction errors are raised as RuntimeError
  # Check error message for specific error details
  case e.message
  when /validation/i
    puts "Validation error: #{e.message}"
  # Match "IO" / "I/O" only as a whole word. A bare /io/ also matches inside
  # ordinary words such as "extraction" or "conversion", which would route
  # almost every error into this branch.
  when /\bi\/?o\b|not found/i
    puts "IO error: #{e.message}"
    raise
  else
    puts "Extraction failed: #{e.message}"
    raise
  end
end
library(kreuzberg)
# Send plain-text bytes with a made-up MIME type and recover from the
# resulting error with tryCatch; `parsed` is NULL when extraction fails.
payload <- charToRaw("Hello, world!")

parsed <- tryCatch(
  jsonlite::fromJSON(
    extract_bytes_sync(
      content = payload,
      mime_type = "application/x-nonexistent",
      config = ExtractionConfig$default()
    ),
    simplifyVector = FALSE
  ),
  error = function(err) {
    # Report the condition message and signal failure with NULL.
    message(sprintf("Extraction failed: %s", conditionMessage(err)))
    NULL
  }
)

if (!is.null(parsed)) {
  cat(sprintf("Extracted %d characters\n", nchar(parsed$content)))
} else {
  cat("No content extracted; falling back to original bytes\n")
}
use kreuzberg::{extract_file_sync, ExtractionConfig, KreuzbergError};
fn main() {
let config = ExtractionConfig::default();
match extract_file_sync("document.pdf", None, &config) {
Ok(result) => println!("{}", result.content),
Err(KreuzbergError::Io(e)) => eprintln!("File error: {e}"),
Err(KreuzbergError::UnsupportedFormat(mime)) => {
eprintln!("Unsupported format: {mime}");
}
Err(KreuzbergError::Parsing { message, .. }) => {
eprintln!("Corrupt or invalid document: {message}");
}
Err(KreuzbergError::MissingDependency(dep)) => {
eprintln!("Missing dependency — install {dep}");
}
Err(e) => eprintln!("Extraction failed: {e}"),
}
}
import Foundation
import Kreuzberg
import RustBridge
// The Swift binding throws `RustString` (not `KreuzbergError`) for every
// failure surfaced from the Rust core. The string preserves the original
// error variant name and message (e.g. "UnsupportedFormat: ...",
// "MissingDependency: ...", "Parsing: ...") so callers can pattern-match
// on the prefix or simply print the message.

/// Maps an error message to a user-facing line. The checks run in order and
/// the first variant name found in the message decides the wording.
func describeFailure(_ text: String) -> String {
    if text.contains("UnsupportedFormat") {
        return "Unsupported format: \(text)"
    }
    if text.contains("MissingDependency") {
        return "Install the required dependency: \(text)"
    }
    if text.contains("Parsing") {
        return "Corrupt or invalid document: \(text)"
    }
    if text.contains("Io") {
        return "File error: \(text)"
    }
    return "Extraction failed: \(text)"
}

do {
    let settings = try extractionConfigFromJson("{}")
    let extraction = try extractFileSync("document.pdf", nil, settings)
    print(extraction.content().toString())
} catch let failure as RustString {
    print(describeFailure(failure.toString()))
} catch {
    // Anything that is not a RustString ends up here.
    print("Unexpected error: \(error)")
}
# Example: Handling extraction errors
case Kreuzberg.extract_file("document.pdf") do
  {:ok, result} ->
    IO.puts("Successfully extracted content")
    # String.length/1 counts characters (graphemes). byte_size/1 counts UTF-8
    # bytes and would over-report for any non-ASCII text.
    IO.puts("Content length: #{String.length(result.content)} characters")

  {:error, reason} ->
    IO.puts("Extraction failed: #{reason}")
end

# Example: Handling with custom error message
result = Kreuzberg.extract_file("nonexistent.pdf")

case result do
  # The payload is not used here; the underscore avoids an unused-variable warning.
  {:ok, _data} ->
    IO.puts("File processed successfully")

  {:error, error} ->
    IO.puts("Error details: #{inspect(error)}")
end

# Example: Extract with pattern matching
case Kreuzberg.extract(<<>>, "application/pdf") do
  {:ok, result} ->
    IO.puts("Content: #{result.content}")

  # Binary (string) reasons are printed as-is; anything else is inspected.
  {:error, msg} when is_binary(msg) ->
    IO.puts("Validation error: #{msg}")

  {:error, reason} ->
    IO.puts("Unknown error: #{inspect(reason)}")
end
// Import from the published browser package and use its `initWasm`
// initialiser, matching the package named in this guide's introduction.
import { extractBytes, initWasm } from "@kreuzberg/wasm";

await initWasm();

const fileInput = document.getElementById("file") as HTMLInputElement;
const file = fileInput.files?.[0];

if (file) {
  try {
    const bytes = new Uint8Array(await file.arrayBuffer());
    // Browsers may report an empty type; the example then falls back to PDF.
    const result = await extractBytes(bytes, file.type || "application/pdf", undefined);
    console.log(`Extracted: ${result.content.length} characters`);
  } catch (error) {
    // Thrown values are not guaranteed to be Error instances.
    const message = error instanceof Error ? error.message : String(error);
    console.error("Extraction failed:", message);
  }
}
const std = @import("std");
const kreuzberg = @import("kreuzberg");
/// Extracts document.pdf and prints the result JSON. On failure it writes a
/// message for the error kind (plus any extra context) to stderr and returns
/// without propagating the error.
pub fn main() !void {
    const result_json = kreuzberg.extract_file_sync("document.pdf", null, "{}") catch |failure| {
        const err_out = std.io.getStdErr().writer();
        switch (failure) {
            error.Io => try err_out.print("File error\n", .{}),
            error.UnsupportedFormat => try err_out.print("Unsupported format\n", .{}),
            error.Parsing => try err_out.print("Corrupt or invalid document\n", .{}),
            error.MissingDependency => try err_out.print("Missing dependency — install required backend\n", .{}),
            error.Ocr => try err_out.print("OCR processing failed\n", .{}),
            error.OutOfMemory => try err_out.print("Out of memory\n", .{}),
            // Unlisted errors are reported by name.
            else => try err_out.print("Extraction failed: {s}\n", .{@errorName(failure)}),
        }
        // Append the binding's last-error text when one is available.
        if (kreuzberg._last_error()) |details| {
            try err_out.print(" context: {s}\n", .{details});
        }
        return;
    };
    defer std.heap.c_allocator.free(result_json);

    const out = std.io.getStdOut().writer();
    try out.print("{s}\n", .{result_json});
}
OCR for Scanned Documents¶
Kreuzberg runs OCR automatically when it detects an image or scanned PDF. You can also force OCR on any document:
#include "kreuzberg.h"
#include <stdio.h>
/* Run OCR on scanned.png with a Tesseract/English OCR config and print the
 * recognised text. Returns 0 on success and 1 when extraction fails. */
int main(void) {
    int status = 0;

    struct ConfigBuilder *builder = kreuzberg_config_builder_new();
    kreuzberg_config_builder_set_ocr(builder,
        "{\"tesseract\":{\"language\":\"eng\"}}");
    /* NOTE(review): the builder is never freed in this example — presumably
     * kreuzberg_config_builder_build() consumes it; confirm against the header. */
    ExtractionConfig *config = kreuzberg_config_builder_build(builder);

    /* The extraction call takes the configuration as a JSON string. */
    char *config_json = kreuzberg_config_to_json(config);
    struct CExtractionResult *result =
        kreuzberg_extract_file_sync_with_config("scanned.png", config_json);

    if (result && result->success) {
        printf("OCR text: %s\n", result->content);
    } else {
        fprintf(stderr, "OCR error: %s\n", kreuzberg_get_error_details().message);
        /* Report the failure through the exit status instead of exiting 0. */
        status = 1;
    }

    /* result may be NULL on failure; only release what was actually returned. */
    if (result) {
        kreuzberg_free_result(result);
    }
    kreuzberg_free_string(config_json);
    kreuzberg_config_free(config);
    return status;
}
import 'package:kreuzberg/kreuzberg.dart';
/// Extracts `scanned.pdf` with the Tesseract OCR backend (English) enabled
/// but not forced.
Future<void> main() async {
  const tesseract = OcrConfig(
    enabled: true,
    backend: 'tesseract',
    language: 'eng',
    autoRotate: false,
  );

  final settings = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    ocr: tesseract,
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
  );

  final extraction = await KreuzbergBridge.extractFile('scanned.pdf', null, settings);
  print(extraction.content);
}
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts scanned.pdf with the Tesseract OCR backend (English) and logs
// the length of the extracted content; it exits via log.Fatalf on error.
func main() {
	ocr := &kreuzberg.OcrConfig{Backend: "tesseract", Language: "eng"}
	settings := kreuzberg.ExtractionConfig{Ocr: ocr}

	extraction, err := kreuzberg.ExtractFileSync("scanned.pdf", nil, settings)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	log.Println(len(extraction.Content))
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.OcrConfig;
import java.io.IOException;
/** Runs OCR on scanned.pdf with the Tesseract backend (English) and prints the text. */
public class Main {
    public static void main(String[] args) {
        // Configuration is built inside the try block so any failure while
        // building or extracting is reported the same way.
        try {
            OcrConfig tesseract = OcrConfig.builder()
                .backend("tesseract")
                .language("eng")
                .build();
            ExtractionConfig settings = ExtractionConfig.builder()
                .ocr(tesseract)
                .build();

            ExtractionResult extraction = Kreuzberg.extractFile("scanned.pdf", settings);
            System.out.println(extraction.getContent());
        } catch (IOException | KreuzbergException failure) {
            System.err.println("Extraction failed: " + failure.getMessage());
        }
    }
}
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/** Extracts `scanned.pdf` with the Tesseract OCR backend (English) and prints the text. */
fun main() {
    val tesseract = OcrConfig.builder()
        .withBackend("tesseract")
        .withLanguage("eng")
        .build()

    // withOcr takes an Optional, so the OCR config is wrapped explicitly.
    val settings = ExtractionConfig.builder()
        .withOcr(Optional.of(tesseract))
        .build()

    val extraction = Kreuzberg.extractFileSync(Paths.get("scanned.pdf"), null, settings)
    println(extraction.content())
}
from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig
# OCR scanned.pdf with the Tesseract backend (English), then print a
# 100-character preview and the total length of the extracted text.
ocr_settings = OcrConfig(backend="tesseract", language="eng")
settings: ExtractionConfig = ExtractionConfig(ocr=ocr_settings)

extraction = extract_file_sync("scanned.pdf", config=settings)
text: str = extraction.content

print(f"Extracted content (preview): {text[:100]}")
print(f"Total characters: {len(text)}")
library(kreuzberg)
# Force OCR through the Tesseract backend (English).
ocr_settings <- list(backend = "tesseract", language = "eng")
settings <- list(force_ocr = TRUE, ocr = ocr_settings)

# Extract text from a scanned PNG image and parse the JSON result.
raw_json <- extract_file_sync("scan.png", "image/png", settings)
parsed <- jsonlite::fromJSON(raw_json, simplifyVector = FALSE)
text <- parsed$content

cat(sprintf("Extracted %d characters\n", nchar(text)))
cat("Content preview:\n")
# Show only the first 200 characters.
cat(substr(text, 1, 200))
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};
/// Extracts `scanned.pdf` with the Tesseract OCR backend (English) and
/// prints the recognised text.
///
/// # Errors
///
/// Propagates any error returned by `extract_file_sync`.
fn main() -> kreuzberg::Result<()> {
    // Only backend and language are set; every other field keeps its default.
    let tesseract = OcrConfig {
        backend: String::from("tesseract"),
        language: String::from("eng"),
        ..Default::default()
    };
    let settings = ExtractionConfig {
        ocr: Some(tesseract),
        ..Default::default()
    };

    let extraction = extract_file_sync("scanned.pdf", None, &settings)?;
    println!("{}", extraction.content);
    Ok(())
}
alias Kreuzberg.ExtractionConfig
# Enable OCR with the Tesseract backend, extract the scanned PDF, and print
# the text followed by the result metadata.
ocr_options = %{"enabled" => true, "backend" => "tesseract"}
settings = %ExtractionConfig{ocr: ocr_options}

# A failed extraction raises a MatchError here because only {:ok, _} is matched.
{:ok, extraction} = Kreuzberg.extract_file("scanned_document.pdf", nil, settings)

IO.puts("OCR Extracted content:")
IO.puts(extraction.content)
IO.puts("Metadata: #{inspect(extraction.metadata)}")
import { enableOcr, extractFromFile, initWasm } from "@kreuzberg/wasm";
await initWasm();
// OCR is switched on in a separate step after module initialisation.
await enableOcr();

const picker = document.getElementById("file") as HTMLInputElement;
const selected = picker.files?.[0];

// Run OCR only once a file has been chosen; its own MIME type is passed along.
if (selected) {
  const extraction = await extractFromFile(selected, selected.type, {
    ocr: {
      backend: "kreuzberg-tesseract",
      language: "eng",
    },
  });
  console.log(extraction.content);
}
import { enableOcr, extractFile, initWasm } from "@kreuzberg/wasm";
await initWasm();
await enableOcr(); // Uses native kreuzberg-tesseract backend

// Extract a PNG from a file path with the OCR backend and language spelled out.
const extraction = await extractFile("./scanned_document.png", "image/png", {
  ocr: {
    backend: "kreuzberg-tesseract",
    language: "eng",
  },
});

console.log(extraction.content);
const std = @import("std");
const kreuzberg = @import("kreuzberg");
/// Extracts scanned.pdf with a JSON config selecting the Tesseract OCR
/// backend (English) and prints the result JSON.
pub fn main() !void {
    var gpa_state = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa_state.deinit();
    const gpa = gpa_state.allocator();

    // The configuration crosses the binding boundary as a JSON document.
    const ocr_config_json =
        \\{
        \\ "ocr": {
        \\ "backend": "tesseract",
        \\ "language": "eng"
        \\ }
        \\}
    ;

    const json_from_ffi = try kreuzberg.extract_file_sync("scanned.pdf", null, ocr_config_json);
    // The returned buffer is released through the C allocator.
    defer std.heap.c_allocator.free(json_from_ffi);

    // Print from a copy owned by the general-purpose allocator.
    const json_copy = try gpa.dupe(u8, json_from_ffi);
    defer gpa.free(json_copy);

    const out = std.io.getStdOut().writer();
    try out.print("{s}\n", .{json_copy});
}
Process Multiple Files¶
Pass a list of files to extract them in parallel. Most bindings wrap each path in a BatchFileItem, which can also carry a per-file config override:
#include "kreuzberg.h"
#include <stdio.h>
#include <stdlib.h>
/* Extract three files in one batch call and print the results JSON.
 * Returns 0 on success and 1 when the batch call yields no result. */
int main(void) {
    int exit_code = 0;

    /* Items is a JSON array of BatchFileItem objects.
     * Each entry has a "path" field and an optional "config" override. */
    const char *items_json =
        "["
        " {\"path\": \"doc1.pdf\"},"
        " {\"path\": \"doc2.docx\"},"
        " {\"path\": \"scan.png\", \"config\": {\"force_ocr\": true}}"
        "]";

    KREUZBERGExtractionConfig *cfg = kreuzberg_extraction_config_default();

    /* Returns a JSON array of ExtractionResult objects, or NULL on failure. */
    char *results_json = kreuzberg_batch_extract_files_sync(items_json, cfg);

    if (results_json != NULL) {
        printf("%s\n", results_json);
        kreuzberg_free_string(results_json);
    } else {
        fprintf(stderr, "batch extraction failed (code %d): %s\n",
                kreuzberg_last_error_code(),
                kreuzberg_last_error_context());
        exit_code = 1;
    }

    /* The config is released on both paths. */
    kreuzberg_extraction_config_free(cfg);
    return exit_code;
}
import 'package:kreuzberg/kreuzberg.dart';
/// Extracts two files in one batch; the second item overrides the config to
/// force OCR.
Future<void> main() async {
  final batch = <BatchFileItem>[
    const BatchFileItem(path: 'doc1.pdf'),
    BatchFileItem(
      path: 'scan.pdf',
      config: FileExtractionConfig(forceOcr: true),
    ),
  ];

  // Sync semantics — flutter_rust_bridge still returns a Future from Dart.
  final extractions = await KreuzbergBridge.batchExtractFilesSync(batch);

  print('Processed ${extractions.length} files');
  for (final extraction in extractions) {
    final charCount = extraction.content.length;
    print('${extraction.mimeType}: $charCount chars');
  }
}
package main
import (
"log"
"github.com/kreuzberg-dev/kreuzberg/v5"
)
// main extracts three documents in one batch call and prints the content
// length of each result; it exits via log.Fatalf if the batch call fails.
func main() {
	paths := []string{"doc1.pdf", "doc2.docx", "doc3.pptx"}

	// One BatchFileItem per path, with no per-file overrides.
	batch := make([]kreuzberg.BatchFileItem, 0, len(paths))
	for _, path := range paths {
		batch = append(batch, kreuzberg.BatchFileItem{Path: path})
	}

	extractions, err := kreuzberg.BatchExtractFilesSync(batch, kreuzberg.ExtractionConfig{})
	if err != nil {
		log.Fatalf("batch extraction failed: %v", err)
	}

	for index, extraction := range extractions {
		println("Doc", index, "content length:", len(extraction.Content))
	}
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.BatchFileItem;
import dev.kreuzberg.ExtractionConfig;
import java.nio.file.Paths;
import java.util.List;
import java.util.Arrays;
// Build one BatchFileItem per file; the second constructor argument is left null.
List<BatchFileItem> batch = Arrays.asList(
    new BatchFileItem(Paths.get("doc1.pdf"), null),
    new BatchFileItem(Paths.get("doc2.docx"), null),
    new BatchFileItem(Paths.get("doc3.pptx"), null)
);

// Extract the whole batch with a default-built configuration.
ExtractionConfig defaults = ExtractionConfig.builder().build();
List<ExtractionResult> extractions = Kreuzberg.batchExtractFilesSync(batch, defaults);

extractions.forEach(extraction ->
    System.out.println("Content length: " + extraction.content().length()));
import dev.kreuzberg.*
import java.nio.file.Paths
/** Extracts three files in one batch call and prints the content length of each. */
fun main() {
    val defaults = ExtractionConfig.builder().build()

    // One BatchFileItem per path; the second constructor argument is left null.
    val batch = listOf("doc1.pdf", "doc2.docx", "report.pdf")
        .map { BatchFileItem(Paths.get(it), null) }

    val extractions = Kreuzberg.batchExtractFilesSync(batch, defaults)
    for ((position, extraction) in extractions.withIndex()) {
        println("File $position: ${extraction.content().length} chars")
    }
}
from kreuzberg import batch_extract_files_sync, BatchFileItem, ExtractionConfig
# Extract three files in one batch call and summarise each result.
paths = ["doc1.pdf", "doc2.docx", "doc3.html"]
batch = [BatchFileItem(path=path) for path in paths]

extractions = batch_extract_files_sync(batch, ExtractionConfig())

for index, extraction in enumerate(extractions):
    print(f"Document {index}: {len(extraction.content)} chars, {len(extraction.tables)} tables")
require 'kreuzberg'
# Extract three files in one batch call (with caching enabled) and print a
# short report per document, numbered from 1.
batch = %w[doc1.pdf doc2.docx doc3.pptx].map do |path|
  Kreuzberg::BatchFileItem.new(path: path)
end

options = Kreuzberg::ExtractionConfig.new(use_cache: true)
extractions = Kreuzberg.batch_extract_files_sync(batch, config: options)

extractions.each.with_index(1) do |extraction, position|
  puts "Document #{position}:"
  puts " Extracted: #{extraction.content.length} characters"
  puts " Quality: #{extraction.quality_score}"
  puts " MIME: #{extraction.mime_type}"
end
library(kreuzberg)
# The batch is passed as a JSON array with one {"path": ...} object per file.
paths <- c("report.pdf", "slides.pptx", "data.xlsx")
batch_json <- jsonlite::toJSON(
  lapply(paths, function(path) list(path = path)),
  auto_unbox = TRUE
)

raw_json <- batch_extract_files_sync(items = batch_json, config = ExtractionConfig$default())
extractions <- jsonlite::fromJSON(raw_json, simplifyVector = FALSE)

# Print index, MIME type, and content length for every result.
for (i in seq_along(extractions)) {
  entry <- extractions[[i]]
  cat(sprintf("[%d] mime=%s chars=%d\n",
              i, entry$mime_type, nchar(entry$content)))
}
use kreuzberg::{batch_extract_files_sync, BatchFileItem, ExtractionConfig};
/// Extracts three files in one batch call and prints the number of
/// characters extracted from each.
///
/// # Errors
///
/// Propagates any error returned by `batch_extract_files_sync`.
fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig::default();
    // `config: None` means no per-file override for that item.
    let items = vec![
        BatchFileItem { path: "doc1.pdf".into(), config: None },
        BatchFileItem { path: "doc2.docx".into(), config: None },
        BatchFileItem { path: "report.pdf".into(), config: None },
    ];

    let results = batch_extract_files_sync(items, &config)?;
    for (i, result) in results.iter().enumerate() {
        // `len()` reports UTF-8 bytes; count `char`s so the "chars" label is
        // accurate for non-ASCII text.
        println!("File {}: {} chars", i, result.content.chars().count());
    }
    Ok(())
}
import Foundation
import Kreuzberg
import RustBridge
// `BatchFileItem` is an opaque swift-bridge class with no public Swift
// constructor — build items from JSON via `batchFileItemFromJson`.
let paths = ["doc1.pdf", "doc2.docx", "report.pdf"]
let batch = RustVec<BatchFileItem>()
for path in paths {
    let itemJson = "{\"path\": \"\(path)\"}"
    batch.push(value: try batchFileItemFromJson(itemJson))
}

// Extract the whole batch with a configuration parsed from an empty JSON object.
let settings = try extractionConfigFromJson("{}")
let extractions = try batchExtractFilesSync(batch, settings)

for (position, extraction) in extractions.enumerated() {
    let charCount = extraction.content().toString().count
    print("File \(position): \(charCount) chars")
}
# Extract three PDFs in one batch call and print a short report per result.
file_paths = ["document1.pdf", "document2.pdf", "document3.pdf"]

# A failed batch raises a MatchError here because only {:ok, _} is matched.
{:ok, results} = Kreuzberg.batch_extract_files(file_paths)

Enum.each(results, fn result ->
  IO.puts("File: #{result.mime_type}")
  # String.length/1 counts characters (graphemes). byte_size/1 counts UTF-8
  # bytes and would over-report for any non-ASCII text.
  IO.puts("Content length: #{String.length(result.content)} characters")
  IO.puts("Tables: #{length(result.tables)}")
  IO.puts("---")
end)

IO.puts("Total files processed: #{length(results)}")
import { extractFromFile, initWasm } from "@kreuzberg/wasm";
await initWasm();

const picker = document.getElementById("files") as HTMLInputElement;
// `files` can be null, so fall back to an empty list.
const selected = Array.from(picker.files || []);

// Start every extraction at once and wait for all of them to finish.
const extractions = await Promise.all(selected.map((file) => extractFromFile(file)));

extractions.forEach((extraction, index) => {
  console.log(`File ${index + 1}: ${extraction.content.length} characters`);
});
const std = @import("std");
const kreuzberg = @import("kreuzberg");
/// Extracts three files in one batch call and prints the results JSON.
pub fn main() !void {
    // Batch items are passed as a JSON-encoded array across the FFI boundary.
    const batch_json =
        \\[
        \\ {"path": "doc1.pdf", "config": null},
        \\ {"path": "doc2.docx", "config": null},
        \\ {"path": "report.pdf", "config": null}
        \\]
    ;

    const results_json = try kreuzberg.batch_extract_files_sync(batch_json, "{}");
    // The returned buffer is released through the C allocator.
    defer std.heap.c_allocator.free(results_json);

    const out = std.io.getStdOut().writer();
    try out.print("{s}\n", .{results_json});
}
Read Document Metadata¶
Every extraction result includes format-specific metadata — page count for PDFs, sheet names for Excel, dimensions for images:
#include "kreuzberg.h"
#include <stdio.h>
/* Extract document.pdf and print its content, MIME type, and whichever
 * optional metadata fields are present. Returns 0 on success, 1 on failure. */
int main(void) {
    /* NOTE(review): this one-argument call differs from the
     * (path, mime_type, config) form used by other C examples — confirm it
     * matches the header this snippet is meant to compile against. */
    struct CExtractionResult *result = kreuzberg_extract_file_sync("document.pdf");
    if (!result || !result->success) {
        fprintf(stderr, "Error: %s\n", kreuzberg_get_error_details().message);
        /* A non-NULL but unsuccessful result is still released, so the
         * failure path does not leak it. */
        if (result) {
            kreuzberg_free_result(result);
        }
        return 1;
    }

    printf("Content: %s\n", result->content);
    printf("MIME: %s\n", result->mime_type);

    /* The remaining fields are optional and printed only when set. */
    if (result->language)
        printf("Language: %s\n", result->language);
    if (result->date)
        printf("Date: %s\n", result->date);
    if (result->subject)
        printf("Subject: %s\n", result->subject);
    if (result->metadata_json)
        printf("Metadata: %s\n", result->metadata_json);

    kreuzberg_free_result(result);
    return 0;
}
using Kreuzberg;
// Read format-specific metadata from a PDF and from an HTML page.
// Both extractions share one configuration with PDF metadata extraction enabled.
var config = new ExtractionConfig
{
PdfOptions = new PdfConfig { ExtractMetadata = true }
};
var result = KreuzbergLib.ExtractFileSync("document.pdf", config);
// Metadata may be null, hence the null-conditional access before reading Format.Pdf.
if (result.Metadata?.Format.Pdf != null)
{
var pdfMeta = result.Metadata.Format.Pdf;
Console.WriteLine($"Pages: {pdfMeta.PageCount}");
Console.WriteLine($"Author: {pdfMeta.Author}");
Console.WriteLine($"Title: {pdfMeta.Title}");
}
// Second extraction: an HTML page, read through Format.Html.
var htmlResult = KreuzbergLib.ExtractFileSync("page.html", config);
if (htmlResult.Metadata?.Format.Html != null)
{
var htmlMeta = htmlResult.Metadata.Format.Html;
Console.WriteLine($"Title: {htmlMeta.Title}");
Console.WriteLine($"Description: {htmlMeta.Description}");
// Keywords are exposed as a collection; print them comma-separated
if (htmlMeta.Keywords != null && htmlMeta.Keywords.Count > 0)
{
Console.WriteLine($"Keywords: {string.Join(", ", htmlMeta.Keywords)}");
}
// Canonical URL (exposed as CanonicalUrl)
if (htmlMeta.CanonicalUrl != null)
{
Console.WriteLine($"Canonical URL: {htmlMeta.CanonicalUrl}");
}
// Open Graph fields live in a dictionary; check each key before reading it
if (htmlMeta.OpenGraph != null && htmlMeta.OpenGraph.Count > 0)
{
if (htmlMeta.OpenGraph.ContainsKey("image"))
Console.WriteLine($"Open Graph Image: {htmlMeta.OpenGraph["image"]}");
if (htmlMeta.OpenGraph.ContainsKey("title"))
Console.WriteLine($"Open Graph Title: {htmlMeta.OpenGraph["title"]}");
if (htmlMeta.OpenGraph.ContainsKey("type"))
Console.WriteLine($"Open Graph Type: {htmlMeta.OpenGraph["type"]}");
}
// Twitter Card fields live in a dictionary as well
if (htmlMeta.TwitterCard != null && htmlMeta.TwitterCard.Count > 0)
{
if (htmlMeta.TwitterCard.ContainsKey("card"))
Console.WriteLine($"Twitter Card Type: {htmlMeta.TwitterCard["card"]}");
if (htmlMeta.TwitterCard.ContainsKey("creator"))
Console.WriteLine($"Twitter Creator: {htmlMeta.TwitterCard["creator"]}");
}
// Document language and text direction, printed only when present
if (htmlMeta.Language != null)
Console.WriteLine($"Language: {htmlMeta.Language}");
if (htmlMeta.TextDirection != null)
Console.WriteLine($"Text Direction: {htmlMeta.TextDirection}");
// Headers: print the text of each one (Select is the System.Linq extension)
if (htmlMeta.Headers != null && htmlMeta.Headers.Count > 0)
Console.WriteLine($"Headers: {string.Join(", ", htmlMeta.Headers.Select(h => h.Text))}");
// Links: one line per link with its target and text
if (htmlMeta.Links != null && htmlMeta.Links.Count > 0)
{
foreach (var link in htmlMeta.Links)
Console.WriteLine($"Link: {link.Href} ({link.Text})");
}
// Images: print the source of each one
if (htmlMeta.Images != null && htmlMeta.Images.Count > 0)
Console.WriteLine($"Images: {string.Join(", ", htmlMeta.Images.Select(i => i.Src))}");
// Structured data: only the number of items is reported
if (htmlMeta.StructuredData != null && htmlMeta.StructuredData.Count > 0)
Console.WriteLine($"Structured Data items: {htmlMeta.StructuredData.Count}");
}
import 'package:kreuzberg/kreuzberg.dart';
/// Extracts `document.pdf` and prints every metadata field that is present,
/// followed by all entries of the `additional` map.
Future<void> main() async {
  final extraction = await KreuzbergBridge.extractFile('document.pdf', null);
  final meta = extraction.metadata;

  // Each optional field is copied into a local so the null check promotes it.
  final title = meta.title;
  if (title != null) print('Title: $title');

  final subject = meta.subject;
  if (subject != null) print('Subject: $subject');

  final authors = meta.authors;
  if (authors != null) print('Authors: ${authors.join(', ')}');

  final keywords = meta.keywords;
  if (keywords != null) print('Keywords: ${keywords.join(', ')}');

  final language = meta.language;
  if (language != null) print('Language: $language');

  final createdAt = meta.createdAt;
  if (createdAt != null) print('Created: $createdAt');

  final modifiedAt = meta.modifiedAt;
  if (modifiedAt != null) print('Modified: $modifiedAt');

  final durationMs = meta.extractionDurationMs;
  if (durationMs != null) print('Extraction took: $durationMs ms');

  // Remaining key/value pairs are listed as-is.
  for (final pair in meta.additional.entries) {
    print('Additional[${pair.key}]: ${pair.value}');
  }
}
package main
import (
"fmt"
"log"
"strings"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts a PDF and an HTML page and prints the format-specific
// metadata of each; it exits via log.Fatalf if either extraction fails.
func main() {
// NOTE(review): ExtractFileSync is called with two arguments here, while the
// other Go examples pass (path, mime, config) — confirm which signature is current.
result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
if err != nil {
log.Fatalf("extract pdf: %v", err)
}
// Access PDF metadata; ok is false when the accessor has no PDF metadata to return
if pdf, ok := result.Metadata.PdfMetadata(); ok {
// Fields are pointers: nil means the value is absent
if pdf.PageCount != nil {
fmt.Printf("Pages: %d\n", *pdf.PageCount)
}
if pdf.Author != nil {
fmt.Printf("Author: %s\n", *pdf.Author)
}
if pdf.Title != nil {
fmt.Printf("Title: %s\n", *pdf.Title)
}
}
// Access HTML metadata
htmlResult, err := kreuzberg.ExtractFileSync("page.html", nil)
if err != nil {
log.Fatalf("extract html: %v", err)
}
if html, ok := htmlResult.Metadata.HTMLMetadata(); ok {
if html.Title != nil {
fmt.Printf("Title: %s\n", *html.Title)
}
if html.Description != nil {
fmt.Printf("Description: %s\n", *html.Description)
}
// Keywords are a string slice; print them comma-separated
if len(html.Keywords) > 0 {
fmt.Printf("Keywords: %s\n", strings.Join(html.Keywords, ", "))
}
// Canonical URL (exposed as CanonicalURL)
if html.CanonicalURL != nil {
fmt.Printf("Canonical URL: %s\n", *html.CanonicalURL)
}
// Access Open Graph fields from map
if len(html.OpenGraph) > 0 {
if image, ok := html.OpenGraph["image"]; ok {
fmt.Printf("Open Graph Image: %s\n", image)
}
if ogTitle, ok := html.OpenGraph["title"]; ok {
fmt.Printf("Open Graph Title: %s\n", ogTitle)
}
if ogType, ok := html.OpenGraph["type"]; ok {
fmt.Printf("Open Graph Type: %s\n", ogType)
}
}
// Access Twitter Card fields from map
if len(html.TwitterCard) > 0 {
if card, ok := html.TwitterCard["card"]; ok {
fmt.Printf("Twitter Card Type: %s\n", card)
}
if creator, ok := html.TwitterCard["creator"]; ok {
fmt.Printf("Twitter Creator: %s\n", creator)
}
}
// Document language and text direction, printed only when present
if html.Language != nil {
fmt.Printf("Language: %s\n", *html.Language)
}
if html.TextDirection != nil {
fmt.Printf("Text Direction: %s\n", *html.TextDirection)
}
// Access headers: collect the text of each header, then join them
if len(html.Headers) > 0 {
headers := make([]string, len(html.Headers))
for i, h := range html.Headers {
headers[i] = h.Text
}
fmt.Printf("Headers: %s\n", strings.Join(headers, ", "))
}
// Access links: one line per link with its target and text
if len(html.Links) > 0 {
for _, link := range html.Links {
fmt.Printf("Link: %s (%s)\n", link.Href, link.Text)
}
}
// Access images: one line per image source
if len(html.Images) > 0 {
for _, image := range html.Images {
fmt.Printf("Image: %s\n", image.Src)
}
}
// Access structured data: only the number of items is reported
if len(html.StructuredData) > 0 {
fmt.Printf("Structured data items: %d\n", len(html.StructuredData))
}
}
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.Metadata;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;
import java.util.Map;
import java.util.List;
/**
 * Extracts {@code document.pdf} and {@code page.html} and prints the metadata
 * fields that are present. Format-specific values are read from the
 * {@code additional} map of the flat {@code Metadata} object.
 */
public class Main {
    public static void main(String[] args) {
        try {
            ExtractionResult result = Kreuzberg.extractFileSync("document.pdf");
            // Metadata is flat — format-specific fields are at the top level
            Metadata metadata = result.getMetadata();
            metadata.getTitle().ifPresent(t -> System.out.println("Title: " + t));
            metadata.getAuthors().ifPresent(a -> System.out.println("Authors: " + String.join(", ", a)));
            // Format-specific fields are in the additional map
            Map<String, Object> extra = metadata.getAdditional();
            printIfPresent("Pages", extra.get("page_count"));

            // Access HTML metadata
            ExtractionResult htmlResult = Kreuzberg.extractFileSync("page.html");
            Metadata htmlMeta = htmlResult.getMetadata();
            htmlMeta.getTitle().ifPresent(t -> System.out.println("Title: " + t));
            Map<String, Object> htmlExtra = htmlMeta.getAdditional();
            String description = (String) htmlExtra.get("description");
            printIfPresent("Description", description);

            // Access keywords as array
            htmlMeta.getKeywords().ifPresent(keywords ->
                System.out.println("Keywords: " + keywords));

            // Access canonical URL (renamed from canonical)
            String canonicalUrl = (String) htmlExtra.get("canonical_url");
            printIfPresent("Canonical URL", canonicalUrl);

            // Access Open Graph fields from map; absent keys are skipped
            // instead of being printed as "null".
            @SuppressWarnings("unchecked")
            Map<String, String> openGraph = (Map<String, String>) htmlExtra.get("open_graph");
            if (openGraph != null) {
                printIfPresent("Open Graph Image", openGraph.get("image"));
                printIfPresent("Open Graph Title", openGraph.get("title"));
                printIfPresent("Open Graph Type", openGraph.get("type"));
            }

            // Access Twitter Card fields from map; absent keys are skipped.
            @SuppressWarnings("unchecked")
            Map<String, String> twitterCard = (Map<String, String>) htmlExtra.get("twitter_card");
            if (twitterCard != null) {
                printIfPresent("Twitter Card Type", twitterCard.get("card"));
                printIfPresent("Twitter Creator", twitterCard.get("creator"));
            }

            // Access new fields
            htmlMeta.getLanguage().ifPresent(l -> System.out.println("Language: " + l));
            String textDirection = (String) htmlExtra.get("text_direction");
            printIfPresent("Text Direction", textDirection);

            // Access headers: join the texts so the line carries a label and
            // has no trailing separator.
            @SuppressWarnings("unchecked")
            List<Map<String, Object>> headers = (List<Map<String, Object>>) htmlExtra.get("headers");
            if (headers != null && !headers.isEmpty()) {
                String joined = headers.stream()
                    .map(h -> String.valueOf(h.get("text")))
                    .collect(java.util.stream.Collectors.joining(", "));
                System.out.println("Headers: " + joined);
            }

            // Access links
            @SuppressWarnings("unchecked")
            List<Map<String, Object>> links = (List<Map<String, Object>>) htmlExtra.get("links");
            if (links != null) {
                for (Map<String, Object> link : links) {
                    System.out.println("Link: " + link.get("href") + " (" + link.get("text") + ")");
                }
            }

            // Access images
            @SuppressWarnings("unchecked")
            List<Map<String, Object>> images = (List<Map<String, Object>>) htmlExtra.get("images");
            if (images != null) {
                for (Map<String, Object> image : images) {
                    System.out.println("Image: " + image.get("src"));
                }
            }

            // Access structured data
            @SuppressWarnings("unchecked")
            List<Map<String, Object>> structuredData = (List<Map<String, Object>>) htmlExtra.get("structured_data");
            if (structuredData != null) {
                System.out.println("Structured data items: " + structuredData.size());
            }
        } catch (IOException | KreuzbergException e) {
            System.err.println("Extraction failed: " + e.getMessage());
        }
    }

    /** Prints {@code "<label>: <value>"} unless {@code value} is {@code null}. */
    private static void printIfPresent(String label, Object value) {
        if (value != null) {
            System.out.println(label + ": " + value);
        }
    }
}
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional
/**
 * Extracts a PDF and an HTML page and prints the common metadata plus the
 * format-specific metadata exposed through the `format()` union.
 */
fun main() {
    val config = ExtractionConfig.builder().build()

    val pdfMetadata = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, config).metadata()
    pdfMetadata.title()?.also { title -> println("Title: $title") }
    pdfMetadata.authors()?.also { authors -> println("Authors: ${authors.joinToString(", ")}") }

    // Format-specific metadata via discriminated union
    pdfMetadata.format()?.pdf()?.also { pdf ->
        pdf.pageCount()?.also { count -> println("Pages: $count") }
        pdf.producer()?.also { producer -> println("Producer: $producer") }
        pdf.pdfVersion()?.also { version -> println("PDF Version: $version") }
    }

    // HTML metadata lives behind the same union
    val htmlMetadata = Kreuzberg.extractFileSync(Paths.get("page.html"), null, config).metadata()
    htmlMetadata.format()?.html()?.also { html ->
        html.title()?.also { title -> println("Title: $title") }
        html.description()?.also { description -> println("Description: $description") }
        html.canonicalUrl()?.also { url -> println("Canonical URL: $url") }
        html.language()?.also { language -> println("Language: $language") }

        println("Keywords: ${html.keywords()}")

        // Open Graph and Twitter Card fields are string-keyed maps
        val openGraph = html.openGraph()
        openGraph["image"]?.also { value -> println("Open Graph Image: $value") }
        openGraph["title"]?.also { value -> println("Open Graph Title: $value") }
        html.twitterCard()["card"]?.also { value -> println("Twitter Card Type: $value") }

        html.headers().forEach { header ->
            println("Header (level ${header.level()}): ${header.text()}")
        }
        html.links().forEach { link ->
            println("Link: ${link.href()} (${link.text()})")
        }
        html.images().forEach { image ->
            println("Image: ${image.src()}")
        }

        val structuredData = html.structuredData()
        if (structuredData.isNotEmpty()) {
            println("Structured data items: ${structuredData.size}")
        }
    }
}
from kreuzberg import extract_file_sync, ExtractionConfig
pdf_result = extract_file_sync("document.pdf", config=ExtractionConfig())

# Metadata is a flat mapping: format-specific keys sit next to the common ones.
pdf_meta = pdf_result.metadata
page_count = pdf_meta.get("page_count")
if page_count:
    print(f"Pages: {page_count}")
pdf_title = pdf_meta.get("title")
if pdf_title:
    print(f"Title: {pdf_title}")
pdf_authors = pdf_meta.get("authors")
if pdf_authors:
    print(f"Authors: {', '.join(pdf_authors)}")

html_result = extract_file_sync("page.html", config=ExtractionConfig())
html_meta = html_result.metadata
html_title = html_meta.get("title")
if html_title:
    print(f"Title: {html_title}")
description = html_meta.get("description")
if description:
    print(f"Description: {description}")

# Keywords come back as a list
keywords = html_meta.get('keywords', [])
if keywords:
    print(f"Keywords: {', '.join(keywords)}")

# The canonical link is stored under 'canonical_url'
canonical_url = html_meta.get('canonical_url')
if canonical_url:
    print(f"Canonical URL: {canonical_url}")

# Open Graph fields are a mapping; an empty or missing mapping prints nothing
open_graph = html_meta.get('open_graph', {}) or {}
if 'image' in open_graph:
    print(f"Open Graph Image: {open_graph['image']}")
if 'title' in open_graph:
    print(f"Open Graph Title: {open_graph['title']}")
if 'type' in open_graph:
    print(f"Open Graph Type: {open_graph['type']}")

# Twitter Card fields follow the same shape
twitter_card = html_meta.get('twitter_card', {}) or {}
if 'card' in twitter_card:
    print(f"Twitter Card Type: {twitter_card['card']}")
if 'creator' in twitter_card:
    print(f"Twitter Creator: {twitter_card['creator']}")

language = html_meta.get('language')
if language:
    print(f"Language: {language}")
text_direction = html_meta.get('text_direction')
if text_direction:
    print(f"Text Direction: {text_direction}")

# Header texts on one line
headers = html_meta.get('headers', [])
if headers:
    print(f"Headers: {', '.join(h['text'] for h in headers)}")

# One line per link and per image; a missing list is treated as empty
for link in html_meta.get('links', []) or []:
    print(f"Link: {link.get('href')} ({link.get('text')})")
for image in html_meta.get('images', []) or []:
    print(f"Image: {image.get('src')}")

structured_data = html_meta.get('structured_data', [])
if structured_data:
    print(f"Structured data items: {len(structured_data)}")
require 'kreuzberg'
pdf_result = Kreuzberg.extract_file_sync('document.pdf')

# The metadata hash is flat: format-specific keys sit at the top level.
pdf_meta = pdf_result.metadata
puts "Pages: #{pdf_meta['page_count']}" if pdf_meta['page_count']
puts "Title: #{pdf_meta['title']}" if pdf_meta['title']
puts "Authors: #{pdf_meta['authors'].join(', ')}" if pdf_meta['authors']

# HTML metadata is read from the same flat hash.
html_meta = Kreuzberg.extract_file_sync('page.html').metadata
puts "Title: #{html_meta['title']}" if html_meta['title']
puts "Description: #{html_meta['description']}" if html_meta['description']

# Keywords are an array
puts "Keywords: #{html_meta['keywords'].join(', ')}" if html_meta['keywords']

# The canonical link is stored under 'canonical_url'
canonical_url = html_meta['canonical_url']
puts "Canonical URL: #{canonical_url}" if canonical_url

# Open Graph fields; a missing hash is treated as empty
open_graph = html_meta['open_graph'] || {}
og_image = open_graph['image']
og_title = open_graph['title']
og_type = open_graph['type']
puts "Open Graph Image: #{og_image}" if og_image
puts "Open Graph Title: #{og_title}" if og_title
puts "Open Graph Type: #{og_type}" if og_type

# Twitter Card fields; a missing hash is treated as empty
twitter_card = html_meta['twitter_card'] || {}
card_type = twitter_card['card']
card_creator = twitter_card['creator']
puts "Twitter Card Type: #{card_type}" if card_type
puts "Twitter Creator: #{card_creator}" if card_creator

language = html_meta['language']
text_direction = html_meta['text_direction']
puts "Language: #{language}" if language
puts "Text Direction: #{text_direction}" if text_direction

# Header texts on one line
if html_meta['headers']
  header_texts = html_meta['headers'].map { |header| header['text'] }
  puts "Headers: #{header_texts.join(', ')}"
end

# One line per link and per image
(html_meta['links'] || []).each { |link| puts "Link: #{link['href']} (#{link['text']})" }
(html_meta['images'] || []).each { |image| puts "Image: #{image['src']}" }

structured_data = html_meta['structured_data']
puts "Structured data items: #{structured_data.length}" if structured_data
library(kreuzberg)
result <- extract_file_sync("document.pdf")

# Top-level fields of the extraction result
cat("Detected Language:", result$detected_language, "\n")
cat("Quality Score:", result$quality_score, "\n")
cat("Keywords:", paste(result$keywords, collapse=", "), "\n\n")

# Print `label` followed by the metadata field `name`, skipping absent (NULL)
# fields. `render` turns the raw value into what is printed.
print_metadata_field <- function(label, name, render = identity) {
  value <- metadata_field(result, name)
  if (!is.null(value)) {
    cat(label, render(value), "\n")
  }
}

cat("Metadata fields:\n")
print_metadata_field("Authors:", "authors", function(a) paste(a, collapse=", "))
print_metadata_field("Created Date:", "created_date")
print_metadata_field("Pages:", "page_count")
use kreuzberg::{extract_file_sync, ExtractionConfig};
/// Extracts `document.pdf` and `page.html` and prints the PDF and HTML
/// metadata fields that are present.
///
/// # Errors
///
/// Propagates any error returned by `extract_file_sync`.
fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig::default();

    let pdf_result = extract_file_sync("document.pdf", None, &config)?;
    if let Some(pdf) = &pdf_result.metadata.pdf {
        if let Some(pages) = &pdf.page_count {
            println!("Pages: {pages}");
        }
        if let Some(author) = &pdf.author {
            println!("Author: {author}");
        }
        if let Some(title) = &pdf.title {
            println!("Title: {title}");
        }
    }

    let html_result = extract_file_sync("page.html", None, &config)?;
    if let Some(html) = &html_result.metadata.html {
        if let Some(title) = &html.title {
            println!("Title: {title}");
        }
        if let Some(description) = &html.description {
            println!("Description: {description}");
        }

        // Keywords are a collection; print it in debug form.
        println!("Keywords: {:?}", html.keywords);

        // The canonical link is exposed as `canonical_url`.
        if let Some(canonical) = &html.canonical_url {
            println!("Canonical URL: {canonical}");
        }

        // Open Graph and Twitter Card values are looked up by key.
        if let Some(image) = html.open_graph.get("image") {
            println!("Open Graph Image: {image}");
        }
        if let Some(title) = html.open_graph.get("title") {
            println!("Open Graph Title: {title}");
        }
        if let Some(card) = html.twitter_card.get("card") {
            println!("Twitter Card Type: {card}");
        }

        if let Some(language) = &html.language {
            println!("Language: {language}");
        }

        // Iterating an empty collection prints nothing, so no emptiness
        // guard is needed around these loops.
        for header in &html.headers {
            println!("Header (level {}): {}", header.level, header.text);
        }
        for link in &html.links {
            println!("Link: {} ({})", link.href, link.text);
        }
        for image in &html.images {
            println!("Image: {}", image.src);
        }

        let structured_count = html.structured_data.len();
        if structured_count > 0 {
            println!("Structured data items: {structured_count}");
        }
    }

    Ok(())
}
import Foundation
import Kreuzberg
import RustBridge
// Build a configuration from an empty JSON object and extract the PDF.
let config = try extractionConfigFromJson("{}")
let result = try extractFileSync("document.pdf", nil, config)
// Every accessor below returns an optional; a field is printed only when set.
let metadata = result.metadata()
// Textual fields are converted with toString() before interpolation
// (presumably bridge string types — confirm against the generated bindings).
if let title = metadata.title() {
print("Title: \(title.toString())")
}
if let subject = metadata.subject() {
print("Subject: \(subject.toString())")
}
if let language = metadata.language() {
print("Language: \(language.toString())")
}
if let createdAt = metadata.created_at() {
print("Created at: \(createdAt.toString())")
}
if let modifiedAt = metadata.modified_at() {
print("Modified at: \(modifiedAt.toString())")
}
if let createdBy = metadata.created_by() {
print("Created by: \(createdBy.toString())")
}
// List-valued fields: convert each element, then print the resulting array.
if let authors = metadata.authors() {
let names = authors.map { $0.toString() }
print("Authors: \(names)")
}
if let keywords = metadata.keywords() {
let words = keywords.map { $0.toString() }
print("Keywords: \(words)")
}
// Interpolated directly, without toString().
if let duration = metadata.extraction_duration_ms() {
print("Extraction duration (ms): \(duration)")
}
// Page information is a nested object; only its total count is printed.
if let pages = metadata.pages() {
print("Page count: \(pages.total_count())")
}
{:ok, pdf_result} = Kreuzberg.extract_file("document.pdf")

# The metadata map is flat: format-specific keys sit at the top level.
pdf_meta = pdf_result.metadata
IO.puts("MIME type: #{pdf_result.mime_type}")
IO.puts("All metadata keys: #{inspect(Map.keys(pdf_meta))}")

# PDF fields, read straight from the flat map
page_count = pdf_meta["page_count"]

if page_count do
  IO.puts("Page count: #{page_count}")
end

authors = pdf_meta["authors"] || []

if authors != [] do
  IO.puts("Authors: #{Enum.join(authors, ", ")}")
end

title = pdf_meta["title"]

if title do
  IO.puts("Title: #{title}")
end

# HTML fields, read from the same kind of flat map
{:ok, html_result} = Kreuzberg.extract_file("page.html")
html_meta = html_result.metadata
keywords = html_meta["keywords"] || []

if keywords != [] do
  IO.puts("Keywords: #{Enum.join(keywords, ", ")}")
end

description = html_meta["description"]

if description do
  IO.puts("Description: #{description}")
end
import { extractFileSync } from "@kreuzberg/node";
// Extract a PDF and dump its metadata object.
const pdfResult = extractFileSync("document.pdf");
console.log(`Metadata: ${JSON.stringify(pdfResult.metadata)}`);
const { pageCount } = pdfResult.metadata;
if (pageCount) {
  console.log(`Pages: ${pageCount}`);
}

// Extract an HTML page; its metadata is read from the same flat object.
const htmlMeta = extractFileSync("page.html").metadata;
console.log(`HTML Metadata: ${JSON.stringify(htmlMeta)}`);
if (htmlMeta.title) {
  console.log(`Title: ${htmlMeta.title}`);
}

// Keywords are an array
if (htmlMeta.keywords?.length) {
  console.log(`Keywords: ${htmlMeta.keywords.join(", ")}`);
}

// The canonical link is exposed as canonicalUrl
if (htmlMeta.canonicalUrl) {
  console.log(`Canonical URL: ${htmlMeta.canonicalUrl}`);
}

// Open Graph fields are looked up by key
const openGraph = htmlMeta.openGraph;
if (openGraph) {
  const ogImage = openGraph["image"];
  const ogTitle = openGraph["title"];
  const ogType = openGraph["type"];
  if (ogImage) {
    console.log(`Open Graph Image: ${ogImage}`);
  }
  if (ogTitle) {
    console.log(`Open Graph Title: ${ogTitle}`);
  }
  if (ogType) {
    console.log(`Open Graph Type: ${ogType}`);
  }
}

// Twitter Card fields are looked up by key
const twitterCard = htmlMeta.twitterCard;
if (twitterCard) {
  const cardType = twitterCard["card"];
  const cardCreator = twitterCard["creator"];
  if (cardType) {
    console.log(`Twitter Card Type: ${cardType}`);
  }
  if (cardCreator) {
    console.log(`Twitter Creator: ${cardCreator}`);
  }
}

if (htmlMeta.language) {
  console.log(`Language: ${htmlMeta.language}`);
}
if (htmlMeta.textDirection) {
  console.log(`Text Direction: ${htmlMeta.textDirection}`);
}

// Header texts on one line
if (htmlMeta.headers?.length) {
  console.log(`Headers: ${htmlMeta.headers.map(({ text }) => text).join(", ")}`);
}

// One line per link and per image; a missing list is treated as empty
for (const link of htmlMeta.links ?? []) {
  console.log(`Link: ${link.href} (${link.text})`);
}
for (const image of htmlMeta.images ?? []) {
  console.log(`Image: ${image.src}`);
}

if (htmlMeta.structuredData?.length) {
  console.log(`Structured data items: ${htmlMeta.structuredData.length}`);
}
import { extractFromFile, initWasm } from "@kreuzberg/wasm";
// The WASM module is initialised before the first extraction call.
await initWasm();

const fileInput = document.getElementById("file") as HTMLInputElement;
const file = fileInput.files?.[0];
if (file) {
  const result = await extractFromFile(file);
  console.log(`Metadata: ${JSON.stringify(result.metadata)}`);

  // Access common metadata fields
  if (result.metadata.title) {
    console.log(`Title: ${result.metadata.title}`);
  }

  // Access format-specific metadata
  const metadata = result.metadata;

  // For HTML files
  // NOTE(review): the HTML fields below use snake_case while the top-level
  // fields further down use camelCase (pageCount) — confirm against the
  // binding's type definitions.
  if (metadata.html) {
    const htmlMeta = metadata.html;

    // Title and description are optional; skip them when missing so the
    // output never contains the literal text "undefined".
    if (htmlMeta.title) {
      console.log(`HTML Title: ${htmlMeta.title}`);
    }
    if (htmlMeta.description) {
      console.log(`Description: ${htmlMeta.description}`);
    }

    // Access keywords as array
    if (htmlMeta.keywords && htmlMeta.keywords.length > 0) {
      console.log(`Keywords: ${htmlMeta.keywords.join(", ")}`);
    }

    // Access canonical URL
    if (htmlMeta.canonical_url) {
      console.log(`Canonical URL: ${htmlMeta.canonical_url}`);
    }

    // Access Open Graph fields
    if (htmlMeta.open_graph) {
      if (htmlMeta.open_graph["title"]) {
        console.log(`OG Title: ${htmlMeta.open_graph["title"]}`);
      }
      if (htmlMeta.open_graph["image"]) {
        console.log(`OG Image: ${htmlMeta.open_graph["image"]}`);
      }
    }

    // Access Twitter Card fields
    if (htmlMeta.twitter_card && htmlMeta.twitter_card["card"]) {
      console.log(`Twitter Card Type: ${htmlMeta.twitter_card["card"]}`);
    }

    // Access headers
    if (htmlMeta.headers && htmlMeta.headers.length > 0) {
      console.log(`Headers: ${htmlMeta.headers.map((h: any) => h.text).join(", ")}`);
    }

    // Access links
    if (htmlMeta.links && htmlMeta.links.length > 0) {
      htmlMeta.links.forEach((link: any) => {
        console.log(`Link: ${link.href} (${link.text})`);
      });
    }

    // Access images
    if (htmlMeta.images && htmlMeta.images.length > 0) {
      htmlMeta.images.forEach((image: any) => {
        console.log(`Image: ${image.src}`);
      });
    }

    // Access structured data
    if (htmlMeta.structured_data && htmlMeta.structured_data.length > 0) {
      console.log(`Structured data items: ${htmlMeta.structured_data.length}`);
    }
  }

  // PDF-specific fields are at the top level of metadata
  if (metadata.pageCount) {
    console.log(`Pages: ${metadata.pageCount}`);
  }
  if (metadata.authors && metadata.authors.length > 0) {
    console.log(`Authors: ${metadata.authors.join(", ")}`);
  }
}
const std = @import("std");
const kreuzberg = @import("kreuzberg");
/// Extracts document.pdf, parses the JSON result and prints selected
/// metadata fields (title, authors, language, created_at, page total).
pub fn main() !void {
var gpa = std.heap.GeneralPurposeAllocator(.{}){};
defer _ = gpa.deinit();
const allocator = gpa.allocator();
const config_json = "{}";
const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
// The returned buffer is released with the C allocator rather than `allocator`
// (presumably because the binding allocates it that way — confirm).
defer std.heap.c_allocator.free(result_json);
// The parsed JSON tree is owned by `parsed` and freed by parsed.deinit().
var parsed = try std.json.parseFromSlice(std.json.Value, allocator, result_json, .{});
defer parsed.deinit();
const root = parsed.value;
// Anything other than a JSON object at the top level is silently ignored.
if (root != .object) return;
const stdout = std.io.getStdOut().writer();
if (root.object.get("metadata")) |metadata_val| {
if (metadata_val != .object) return;
const metadata = metadata_val.object;
// Each field is printed only when it is present AND has the expected JSON type.
if (metadata.get("title")) |title_val| {
if (title_val == .string) {
try stdout.print("Title: {s}\n", .{title_val.string});
}
}
// Authors: one line per string element; non-string elements are skipped.
if (metadata.get("authors")) |authors_val| {
if (authors_val == .array) {
for (authors_val.array.items) |author| {
if (author == .string) {
try stdout.print("Author: {s}\n", .{author.string});
}
}
}
}
if (metadata.get("language")) |language_val| {
if (language_val == .string) {
try stdout.print("Language: {s}\n", .{language_val.string});
}
}
if (metadata.get("created_at")) |created_val| {
if (created_val == .string) {
try stdout.print("Created: {s}\n", .{created_val.string});
}
}
// The page total is read from the nested object metadata.pages.total_count.
if (metadata.get("pages")) |pages_val| {
if (pages_val == .object) {
if (pages_val.object.get("total_count")) |total_val| {
if (total_val == .integer) {
try stdout.print("Pages: {d}\n", .{total_val.integer});
}
}
}
}
}
}
Extract and parse metadata using JSON output:
# JSON output carries the metadata alongside the extracted content
kreuzberg extract document.pdf --format json
# Persist the JSON result so it can be inspected afterwards
kreuzberg extract document.pdf --format json > result.json
# Show every metadata field of the saved result
jq '.metadata' result.json
# Metadata of an HTML page
kreuzberg extract page.html --format json | jq '.metadata'
# Select only a few fields
kreuzberg extract document.pdf --format json | jq '.metadata | {page_count, authors, title}'
# Run over several files at once
kreuzberg batch documents/*.pdf --format json > all_metadata.json
JSON Output Structure:
Kreuzberg extracts format-specific metadata for:
- PDF: page count, title, authors (list), creation date, modification date
- HTML: SEO tags, Open Graph, Twitter Card, structured data, headers, links, images
- Excel: sheet count, sheet names
- Email: from, to, CC, BCC, message ID, attachments
- PowerPoint: title, author, description, fonts
- Images: dimensions, format, EXIF data
- Archives: format, file count, file list, sizes
- XML: element count, unique elements
- Text/Markdown: word count, line count, headers, links
See the Types Reference for the complete metadata reference.
Extract Tables¶
Tables come back as both structured cells and Markdown. Kreuzberg extracts them from PDFs, spreadsheets, and HTML:
#include "kreuzberg.h"
#include <stdio.h>
/* Extracts spreadsheet.xlsx and prints its tables as a JSON string.
 * Returns 0 on success and 1 when extraction fails. */
int main(void) {
    struct CExtractionResult *result = kreuzberg_extract_file_sync("spreadsheet.xlsx");
    if (!result || !result->success) {
        fprintf(stderr, "Error: %s\n", kreuzberg_get_error_details().message);
        /* A non-NULL result reporting failure still has to be released,
         * otherwise the error path leaks it. */
        if (result) {
            kreuzberg_free_result(result);
        }
        return 1;
    }

    /* tables_json is NULL when the document contains no tables. */
    if (result->tables_json) {
        printf("Tables (JSON): %s\n", result->tables_json);
    } else {
        printf("No tables found\n");
    }

    kreuzberg_free_result(result);
    return 0;
}
using Kreuzberg;
// Extract the document with a default configuration.
var config = new ExtractionConfig();
var extraction = KreuzbergLib.ExtractFileSync("document.pdf", config);

// Print each table's row count, its Markdown form and its rows.
foreach (var table in extraction.Tables)
{
    var rows = table.Cells;
    Console.WriteLine($"Table with {rows.Count} rows");
    Console.WriteLine(table.Markdown);
    foreach (var cells in rows)
    {
        Console.WriteLine(string.Join(" | ", cells));
    }
}
import 'package:kreuzberg/kreuzberg.dart';
/// Extracts `document.pdf` and prints every table: page number, row count,
/// Markdown form, the rows, and the bounding box when one is available.
Future<void> main() async {
  final extraction = await KreuzbergBridge.extractFile('document.pdf', null);
  for (final table in extraction.tables) {
    final rows = table.cells;
    print('Table on page ${table.pageNumber} with ${rows.length} rows');
    print(table.markdown);
    rows.forEach(print);

    final bbox = table.boundingBox;
    if (bbox != null) {
      print('Bounding box: $bbox');
    }
  }
}
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)
// main extracts document.pdf and prints every detected table: its row count,
// its Markdown rendering and then each row of cells.
func main() {
	extraction, err := kreuzberg.ExtractFileSync("document.pdf", nil)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	for _, tbl := range extraction.Tables {
		rows := tbl.Cells
		fmt.Printf("Table with %d rows\n", len(rows))
		// Markdown rendering of the whole table
		fmt.Println(tbl.Markdown)
		// Raw cells, one row per line
		for _, cells := range rows {
			fmt.Println(cells)
		}
	}
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import dev.kreuzberg.Table;
import java.io.IOException;
import java.util.List;
/**
 * Extracts {@code document.pdf} and prints every table: its row count, its
 * Markdown rendering and each row of cells.
 */
public class Main {
    public static void main(String[] args) {
        try {
            ExtractionResult extraction = Kreuzberg.extractFile("document.pdf");
            for (Table table : extraction.getTables()) {
                int rowCount = table.cells().size();
                System.out.println("Table with " + rowCount + " rows");
                System.out.println(table.markdown());
                // One line per row, using the row's default string form.
                table.cells().forEach(System.out::println);
            }
        } catch (IOException | KreuzbergException failure) {
            System.err.println("Extraction failed: " + failure.getMessage());
        }
    }
}
import dev.kreuzberg.*
import java.nio.file.Paths
/**
 * Extracts `document.pdf` and prints every table: page number, row count,
 * Markdown rendering and each row. A missing table list is treated as empty.
 */
fun main() {
    val extraction = Kreuzberg.extractFileSync(
        Paths.get("document.pdf"),
        null,
        ExtractionConfig.builder().build(),
    )
    extraction.tables().orEmpty().forEach { table ->
        val rows = table.cells()
        println("Table on page ${table.pageNumber()} with ${rows.size} rows")
        println(table.markdown())
        rows.forEach { row -> println(row) }
    }
}
from kreuzberg import extract_file_sync, ExtractionConfig, ExtractedTable
# Extract the document, then print each table's size, Markdown form and rows.
extraction = extract_file_sync("document.pdf", config=ExtractionConfig())
for table in extraction.tables:
    rows = table.cells
    print(f"Table with {len(rows)} rows")
    print(table.markdown)
    for cells in rows:
        print(cells)
library(kreuzberg)
# Extract the spreadsheet and report how many tables were found.
result <- extract_file_sync("spreadsheet.xlsx")
cat("Tables extracted:", length(result$tables), "\n\n")
# Walk the tables by index so each one can be labelled with its position.
for (i in seq_along(result$tables)) {
table <- result$tables[[i]]
cat(sprintf("Table %d:\n", i))
# Each table is queried with nrow/ncol/colnames (presumably a data frame —
# confirm against the binding).
cat(" Rows:", nrow(table), "\n")
cat(" Columns:", ncol(table), "\n")
cat(" Column names:", paste(colnames(table), collapse=", "), "\n")
cat("\n")
# Preview at most the first three rows of non-empty tables.
if (nrow(table) > 0L) {
cat(" Preview (first 3 rows):\n")
print(head(table, 3L))
cat("\n")
}
}
use kreuzberg::{extract_file_sync, ExtractionConfig};
/// Extracts `document.pdf` and prints every table: its row count, its
/// Markdown rendering and each row in debug form.
///
/// # Errors
///
/// Propagates any error returned by `extract_file_sync`.
fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig::default();
    let extraction = extract_file_sync("document.pdf", None, &config)?;

    for table in &extraction.tables {
        let rows = &table.cells;
        println!("Table with {} rows", rows.len());
        println!("{}", table.markdown);
        rows.iter().for_each(|row| println!("{row:?}"));
    }

    Ok(())
}
import Foundation
import Kreuzberg
import RustBridge
// Build a configuration from an empty JSON object and extract the PDF.
let config = try extractionConfigFromJson("{}")
let result = try extractFileSync("document.pdf", nil, config)
let tables = result.tables()
print("Tables: \(tables.count)")
// enumerated() yields zero-based indices, so the first table prints as "Table 0".
for (index, table) in tables.enumerated() {
print("Table \(index) on page \(table.page_number())")
print(table.markdown().toString())
// The bounding box is optional and only printed when present.
if let bbox = table.bounding_box() {
print(" Bounding box: \(bbox.toString())")
}
}
{:ok, result} = Kreuzberg.extract_file("document.pdf")
tables = result.tables
IO.puts("Total tables found: #{length(tables)}")

# Number the tables starting at 1 and print each one's row count and Markdown.
tables
|> Enum.with_index(1)
|> Enum.each(fn {table, position} ->
  IO.puts("\n--- Table #{position} ---")

  # A table without a "cells" entry is reported as having zero rows
  rows = table["cells"] || []
  IO.puts("Rows: #{length(rows)}")

  IO.puts("Markdown representation:")
  IO.puts(table["markdown"])
end)
// Import from the published browser package (@kreuzberg/wasm) and initialise
// it with initWasm(), matching the package name and entry point used by the
// other browser example in this guide.
import { extractBytes, initWasm } from "@kreuzberg/wasm";

// The WASM module is initialised before the first extraction call.
await initWasm();

const fileInput = document.getElementById("file") as HTMLInputElement;
const file = fileInput.files?.[0];
if (file) {
  const bytes = new Uint8Array(await file.arrayBuffer());
  // Fall back to PDF when the browser does not report a MIME type.
  const result = await extractBytes(bytes, file.type || "application/pdf", undefined);

  result.tables?.forEach((table) => {
    console.log(`Table with ${table.cells?.length ?? 0} rows`);
    if (table.markdown) {
      console.log(table.markdown);
    }
    table.cells?.forEach((row) => console.log(row.join(" | ")));
  });
}
const std = @import("std");
const kreuzberg = @import("kreuzberg");
/// Extracts document.pdf, parses the JSON result and prints every table:
/// row count, the string cells of each row, the Markdown form and the page.
pub fn main() !void {
var gpa = std.heap.GeneralPurposeAllocator(.{}){};
defer _ = gpa.deinit();
const allocator = gpa.allocator();
const config_json = "{}";
const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
// The returned buffer is released with the C allocator rather than `allocator`
// (presumably because the binding allocates it that way — confirm).
defer std.heap.c_allocator.free(result_json);
var parsed = try std.json.parseFromSlice(std.json.Value, allocator, result_json, .{});
defer parsed.deinit();
const root = parsed.value;
// Bail out quietly when the result is not an object or has no "tables" array.
if (root != .object) return;
const stdout = std.io.getStdOut().writer();
const tables_val = root.object.get("tables") orelse return;
if (tables_val != .array) return;
for (tables_val.array.items) |table| {
// Entries that are not JSON objects are skipped.
if (table != .object) continue;
if (table.object.get("cells")) |cells_val| {
if (cells_val == .array) {
try stdout.print("Table with {d} rows\n", .{cells_val.array.items.len});
for (cells_val.array.items) |row_val| {
if (row_val != .array) continue;
try stdout.print(" Row:", .{});
// Only string cells are printed; cells of any other JSON type are omitted.
for (row_val.array.items) |cell_val| {
if (cell_val == .string) {
try stdout.print(" [{s}]", .{cell_val.string});
}
}
try stdout.print("\n", .{});
}
}
}
if (table.object.get("markdown")) |markdown_val| {
if (markdown_val == .string) {
try stdout.print("{s}\n", .{markdown_val.string});
}
}
if (table.object.get("page_number")) |page_val| {
if (page_val == .integer) {
try stdout.print("Page: {d}\n", .{page_val.integer});
}
}
}
}
Extract and process tables from documents:
# Extract with JSON format (includes tables when detected)
kreuzberg extract document.pdf --format json
# Save tables to JSON
kreuzberg extract spreadsheet.xlsx --format json > tables.json
# Extract and parse table markdown
# (the trailing `?` in `.tables[]?` makes jq emit nothing, rather than fail,
# when `.tables` cannot be iterated)
kreuzberg extract document.pdf --format json | \
jq '.tables[]? | .markdown'
# Get table cells
kreuzberg extract document.pdf --format json | \
jq '.tables[]? | .cells'
# Batch extract tables from multiple files
# NOTE(review): a recursive `**` glob needs shell support (e.g. bash with
# `shopt -s globstar`, or zsh) — confirm the intended shell.
kreuzberg batch documents/**/*.pdf --format json > all_tables.json
JSON Table Structure:
Going Async¶
Use async extraction in web servers, background workers, or anywhere you need non-blocking I/O:
#include "kreuzberg.h"
#include <stdio.h>
#include <stdlib.h>
/* kreuzberg_extract_file schedules the work on the global Tokio runtime and
 * only returns once extraction has finished. To avoid blocking the caller,
 * invoke it from a dedicated OS thread and synchronize through a semaphore
 * or a callback.
 *
 * Returns 0 after printing the extracted content, 1 when extraction fails. */
int main(void) {
    int exit_code = 1;
    KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_default();
    KREUZBERGExtractionResult *result =
        kreuzberg_extract_file("document.pdf", NULL, config);

    if (result) {
        char *text = kreuzberg_extraction_result_content(result);
        printf("%s\n", text ? text : "(empty)");
        kreuzberg_free_string(text);
        kreuzberg_extraction_result_free(result);
        exit_code = 0;
    } else {
        fprintf(stderr, "extraction failed (code %d): %s\n",
                kreuzberg_last_error_code(),
                kreuzberg_last_error_context());
    }

    /* The configuration is released on both the success and the error path. */
    kreuzberg_extraction_config_free(config);
    return exit_code;
}
package main

import (
	"fmt"
	"log"

	"github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)

// main extracts document.pdf with the default configuration and prints the
// extracted content and the detected MIME type to standard output.
func main() {
	result, err := kreuzberg.ExtractFile("document.pdf", nil, kreuzberg.ExtractionConfig{})
	if err != nil {
		log.Fatalf("extraction failed: %v", err)
	}

	// fmt.Println writes to stdout; the builtin println writes to stderr and
	// is not guaranteed to stay in the language.
	fmt.Println("Content:", result.Content)
	fmt.Println("MIME type:", result.MimeType)
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import java.nio.file.Paths;
// Extract with a default configuration, then print the content and MIME type.
ExtractionResult extraction = Kreuzberg.extractFile(
    Paths.get("document.pdf"),
    ExtractionConfig.builder().build());
String content = extraction.content();
System.out.println(content);
System.out.println(extraction.mimeType());
import dev.kreuzberg.*
import dev.kreuzberg.kt.Kreuzberg
import kotlinx.coroutines.runBlocking
import java.nio.file.Paths
/**
 * Runs the suspending extraction inside `runBlocking` and prints the content,
 * the MIME type and the number of tables (0 when the table list is absent).
 */
fun main() = runBlocking {
    val extraction = Kreuzberg.extractFile(
        Paths.get("document.pdf"),
        null,
        ExtractionConfig.builder().build(),
    )
    println(extraction.content())
    println("MIME type: ${extraction.mimeType()}")
    val tableCount = extraction.tables()?.size ?: 0
    println("Tables: $tableCount")
}
import asyncio
from kreuzberg import extract_file, ExtractionConfig
async def main() -> None:
    """Extract ``document.pdf`` asynchronously and print a short summary.

    Prints the first 200 characters of the content, the number of tables,
    and the ``format_type`` field of the result metadata.
    """
    extraction = await extract_file("document.pdf", config=ExtractionConfig())

    # Only a 200-character preview of the content is printed.
    preview = extraction.content[:200]
    print(preview)

    table_count = len(extraction.tables)
    print(f"Tables: {table_count}")

    format_type = extraction.metadata.format_type
    print(f"Format: {format_type}")
asyncio.run(main())
require 'kreuzberg'
# Configuration for this run: caching off, quality processing on.
options = Kreuzberg::ExtractionConfig.new(use_cache: false, enable_quality_processing: true)

# Run the extraction and keep the returned result for the summary below.
extraction = Kreuzberg.extract_file_async('document.pdf', config: options)

puts "Async extraction complete"
puts "Extracted #{extraction.content.length} characters"
puts "Quality: #{extraction.quality_score}"
library(kreuzberg)
# extract_file is the async variant; extendr drives the tokio runtime so the
# call returns once extraction completes. R has no native async, so wrap with
# the future/promises packages if non-blocking dispatch is required.
# The return value is passed to jsonlite::fromJSON below, i.e. it is handled
# as JSON text rather than as an R list.
raw_json <- extract_file(
  path = "document.pdf",
  mime_type = "application/pdf",
  config = ExtractionConfig$default()
)

# simplifyVector = FALSE keeps the parsed structure as nested lists.
parsed <- jsonlite::fromJSON(raw_json, simplifyVector = FALSE)

# Build the summary line first, then emit it.
summary_line <- sprintf(
  "Extracted %d characters from %s\n",
  nchar(parsed$content),
  parsed$mime_type
)
cat(summary_line)
use kreuzberg::{extract_file, ExtractionConfig};
/// Extracts `document.pdf` with the default configuration and prints its
/// content, MIME type and table count.
///
/// # Errors
///
/// Propagates any error returned by `extract_file`.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let defaults = ExtractionConfig::default();

    // `None::<&str>` leaves the optional second argument unset; the turbofish
    // pins its type so the call does not depend on inference.
    let extraction = extract_file("document.pdf", None::<&str>, &defaults).await?;

    println!("{}", extraction.content);
    println!("MIME type: {}", extraction.mime_type);

    let table_count = extraction.tables.len();
    println!("Tables: {table_count}");

    Ok(())
}
import Foundation
import Kreuzberg
import RustBridge
@main
struct App {
    /// Entry point: extracts `document.pdf` and prints its content,
    /// MIME type and table count.
    static func main() async throws {
        // An empty JSON object is used as the configuration source.
        let defaults = try extractionConfigFromJson("{}")

        // The Swift binding exposes async-compatible entrypoints; even though
        // the bridge calls are synchronous internally, callers may `await`
        // them to integrate with Swift Concurrency.
        let extraction = try await extractFile("document.pdf", nil, defaults)

        let text = extraction.content().toString()
        print(text)

        let mimeType = extraction.mime_type().toString()
        print("MIME type: \(mimeType)")

        let tableCount = extraction.tables().count
        print("Tables: \(tableCount)")
    }
}
# Start the extraction; the returned value is awaited below.
task = Kreuzberg.extract_file_async("document.pdf")

# Task.await/2 waits 5000 ms by default and exits on timeout; pass a larger
# timeout as the second argument when processing big documents.
{:ok, result} = Task.await(task)

content = result.content
table_count = length(result.tables)
metadata = result.metadata

# String.length/1 counts characters (graphemes). byte_size/1 would report
# bytes, which is larger than the character count for any non-ASCII text.
IO.puts("Content length: #{String.length(content)} characters")
IO.puts("Tables: #{table_count}")
IO.puts("Metadata keys: #{inspect(Map.keys(metadata))}")
import { extractFromFile, initWasm } from "@kreuzberg/wasm";
// Initialise the WASM module before running the extraction.
await initWasm();

// Take the first file chosen in the <input id="file"> element, if any.
const picker = document.getElementById("file") as HTMLInputElement;
const selected = picker.files?.[0];

if (selected) {
  const { content, tables } = await extractFromFile(selected);
  const tableCount = tables.length;

  console.log(`Content length: ${content.length} characters`);
  console.log(`Tables: ${tableCount}`);
}
const std = @import("std");
const kreuzberg = @import("kreuzberg");
// Note: the Zig binding is sync-only. There is no `extract_file` async variant —
// the FFI surface exposes blocking entry points that internally drive the global
// Tokio runtime. Use `extract_file_sync` from any thread.
pub fn main() !void {
    // "{}" is an empty JSON object passed as the configuration argument.
    const extraction_json = try kreuzberg.extract_file_sync("document.pdf", null, "{}");
    // The returned buffer is released through the C allocator on scope exit.
    defer std.heap.c_allocator.free(extraction_json);

    // Write the raw JSON result to standard output, followed by a newline.
    const out = std.io.getStdOut().writer();
    try out.print("{s}\n", .{extraction_json});
}
Not Applicable
Async extraction is an API-level feature; the CLI always operates synchronously. For async operations, use one of the language-specific bindings (for example Python, TypeScript, Rust, or WASM).
Next Steps¶
You've covered the core API. Go deeper:
- Configuration Guide — OCR backends, chunking, language detection, config files
- Extract from Bytes — Process in-memory data without writing to disk
- OCR Setup — Tesseract, PaddleOCR, EasyOCR backends
- Types Reference — Full metadata fields for every format
- Docker Deployment — Run Kreuzberg in containers
- API Reference — Complete API documentation