Skip to content

Quick Start

This guide walks you through Kreuzberg's core API — extracting text, handling errors, running OCR, and working with metadata. If you haven't installed a binding yet, do that first: see Installation.

TypeScript users: use @kreuzberg/node for Node.js and @kreuzberg/wasm for browsers and edge runtimes — see Language Support.

Your First Extraction

Pass a file path to get its text content. Kreuzberg detects the format automatically:

C
#include "kreuzberg.h"
#include <stdio.h>
#include <stdlib.h>

/*
 * Minimal extraction example: extract "document.pdf" with the default
 * configuration and print its text content to stdout.
 * Returns 0 on success and 1 when extraction fails.
 */
int main(void) {
    KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_default();

    KREUZBERGExtractionResult *result =
        kreuzberg_extract_file_sync("document.pdf", NULL, config);
    if (!result) {
        /* Read the code first, then the context, and guard the context against
         * NULL before handing it to %s — same order and guard as the
         * error-handling example on this page. */
        int code = kreuzberg_last_error_code();
        const char *context = kreuzberg_last_error_context();
        fprintf(stderr, "extraction failed (code %d): %s\n",
                code, context ? context : "(no message)");
        kreuzberg_extraction_config_free(config);
        return 1;
    }

    /* The content string is released with kreuzberg_free_string below. */
    char *content = kreuzberg_extraction_result_content(result);
    printf("%s\n", content ? content : "(empty)");
    kreuzberg_free_string(content);

    kreuzberg_extraction_result_free(result);
    kreuzberg_extraction_config_free(config);
    return 0;
}
C#
using Kreuzberg;

var result = KreuzbergLib.ExtractFileSync("document.pdf", new ExtractionConfig());

Console.WriteLine(result.Content);
Console.WriteLine($"Tables: {result.Tables.Count}");
Console.WriteLine($"Metadata: {result.Metadata.FormatType}");
Dart
import 'package:kreuzberg/kreuzberg.dart';

Future<void> main() async {
  // Sync semantics — flutter_rust_bridge surfaces every call as a Future,
  // so even the *Sync entrypoints must be awaited from Dart.
  final result = await KreuzbergBridge.extractFileSync('document.pdf', null);

  print(result.content);
  print('MIME type: ${result.mimeType}');
  print('Tables: ${result.tables.length}');
}
Go
package main

import (
    "fmt"
    "log"

    "github.com/kreuzberg-dev/kreuzberg/v5"
)

// main extracts document.pdf with an empty ExtractionConfig and prints its
// content, exiting via log.Fatalf if extraction fails.
func main() {
    result, err := kreuzberg.ExtractFileSync("document.pdf", nil, kreuzberg.ExtractionConfig{})
    if err != nil {
        log.Fatalf("extraction failed: %v", err)
    }

    // fmt.Println writes to stdout; the builtin println is a bootstrapping
    // helper that writes to stderr and is not guaranteed to stay in the language.
    fmt.Println("Content:", result.Content)
}
Java
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import java.nio.file.Paths;

ExtractionConfig config = ExtractionConfig.builder().build();
ExtractionResult result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), config);

System.out.println(result.content());
System.out.println("Tables: " + (result.tables() != null ? result.tables().size() : 0));
System.out.println("Metadata: " + result.metadata());
Kotlin
import dev.kreuzberg.*
import java.nio.file.Paths

fun main() {
    val config = ExtractionConfig.builder().build()
    val result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, config)

    println(result.content())
    println("MIME type: ${result.mimeType()}")
    println("Tables: ${result.tables()?.size ?: 0}")
}
Python
from kreuzberg import extract_file_sync, ExtractionConfig

result = extract_file_sync("document.pdf", config=ExtractionConfig())

print(result.content[:200])
print(f"Tables: {len(result.tables)}")
print(f"Format: {result.metadata.format_type}")
Ruby
require 'kreuzberg'

config = Kreuzberg::ExtractionConfig.new(
  use_cache: true,
  enable_quality_processing: true
)

result = Kreuzberg.extract_file_sync('document.pdf', config: config)

puts "Extracted #{result.content.length} characters"
puts "MIME type: #{result.mime_type}"
puts "Quality score: #{result.quality_score}"
R
library(kreuzberg)

json <- extract_file_sync(
  path = "document.pdf",
  mime_type = "application/pdf",
  config = ExtractionConfig$default()
)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

cat(sprintf("MIME type: %s\n", result$mime_type))
cat(sprintf("Content length: %d characters\n", nchar(result$content)))
Rust
use kreuzberg::{extract_file_sync, ExtractionConfig};

/// Extracts `document.pdf` with the default configuration and prints its
/// content, MIME type, and table count. Extraction errors propagate to the
/// caller through `kreuzberg::Result`.
fn main() -> kreuzberg::Result<()> {
    let defaults = ExtractionConfig::default();
    let extraction = extract_file_sync("document.pdf", None, &defaults)?;

    let table_count = extraction.tables.len();
    println!("{}", extraction.content);
    println!("MIME type: {}", extraction.mime_type);
    println!("Tables: {table_count}");
    Ok(())
}
Swift
import Foundation
import Kreuzberg
import RustBridge

let config = try extractionConfigFromJson("{}")
let result = try extractFileSync("document.pdf", nil, config)

print(result.content().toString())
print("MIME type: \(result.mime_type().toString())")
print("Tables: \(result.tables().count)")
Elixir
# Extract a document and report the size of its content, its table count,
# and the keys present in its metadata map.
{:ok, result} = Kreuzberg.extract_file("document.pdf")

content = result.content
table_count = length(result.tables)
metadata = result.metadata

# String.length/1 counts characters (graphemes); byte_size/1 would report
# bytes, which differs for any non-ASCII text.
IO.puts("Content length: #{String.length(content)} characters")
IO.puts("Tables: #{table_count}")
IO.puts("Metadata keys: #{inspect(Map.keys(metadata))}")
TypeScript
import { extractFileSync } from "@kreuzberg/node";

const result = extractFileSync("document.pdf");

console.log(result.content);
console.log(`Tables: ${result.tables.length}`);
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
WASM
import { extractFromFile, initWasm } from "@kreuzberg/wasm";

await initWasm();

const fileInput = document.getElementById("file") as HTMLInputElement;
const file = fileInput.files?.[0];

if (file) {
  const result = await extractFromFile(file);
  console.log(result.content);
  console.log(`Tables: ${result.tables.length}`);
  console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
}
Zig
const std = @import("std");
const kreuzberg = @import("kreuzberg");

/// Extracts document.pdf with an empty JSON config and prints the returned
/// JSON result to stdout.
pub fn main() !void {
    const config_json = "{}";
    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    // The result is released through the C allocator.
    defer std.heap.c_allocator.free(result_json);

    // Print the result directly; copying it into a second allocator first
    // adds an allocation and a failure path without changing the output.
    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
Bash
# Extract to stdout
kreuzberg extract document.pdf

# Save to file using shell redirection
kreuzberg extract document.pdf > output.txt

# Extract with JSON format (includes metadata)
kreuzberg extract document.pdf --format json

Handle Errors

Wrap extractions in error handling before going further. Kreuzberg reports distinct errors for missing files, parse failures, and OCR problems — as exceptions, error values, or error codes, depending on the binding:

C
#include "kreuzberg.h"
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_default();

    /* Pass an unsupported MIME type to trigger an error. */
    KREUZBERGExtractionResult *result =
        kreuzberg_extract_bytes_sync(NULL, 0, "application/x-unknown", config);
    if (!result) {
        int32_t code = kreuzberg_last_error_code();
        const char *message = kreuzberg_last_error_context();
        /* message is valid until the next FFI call on this thread — copy if needed. */
        fprintf(stderr, "error %d: %s\n", code, message ? message : "(no message)");
        kreuzberg_extraction_config_free(config);
        return code != 0 ? code : 1;
    }

    char *content = kreuzberg_extraction_result_content(result);
    printf("%s\n", content ? content : "(empty)");
    kreuzberg_free_string(content);

    kreuzberg_extraction_result_free(result);
    kreuzberg_extraction_config_free(config);
    return 0;
}
C#
using Kreuzberg;

try
{
    var result = KreuzbergLib.ExtractFileSync("missing.pdf");
    Console.WriteLine(result.Content);
}
catch (KreuzbergValidationException ex)
{
    Console.Error.WriteLine($"Validation error: {ex.Message}");
}
catch (KreuzbergIOException ex)
{
    Console.Error.WriteLine($"IO error: {ex.Message}");
    throw;
}
catch (KreuzbergException ex)
{
    Console.Error.WriteLine($"Extraction failed: {ex.Message}");
    throw;
}
Dart
import 'package:kreuzberg/kreuzberg.dart';

Future<void> main() async {
  try {
    final result = await KreuzbergBridge.extractFile('document.pdf', null);
    print(result.content);
  } on Exception catch (e) {
    // flutter_rust_bridge converts every KreuzbergError variant
    // (Io / UnsupportedFormat / Parsing / MissingDependency, ...)
    // into a Dart exception whose message preserves the original context.
    print('Extraction failed: $e');
  }
}
Go
package main

import (
    "errors"
    "fmt"
    "log"

    "github.com/kreuzberg-dev/kreuzberg/v5"
)

// main tries to extract missing.pdf and logs a message that depends on the
// kind of error returned; on success it prints the extracted content.
func main() {
    result, err := kreuzberg.ExtractFileSync("missing.pdf", nil, kreuzberg.ExtractionConfig{})
    if err != nil {
        // errors.Is matches the sentinel even when the error is wrapped.
        if errors.Is(err, kreuzberg.ErrIo) {
            log.Printf("file not found: %v", err)
        } else if errors.Is(err, kreuzberg.ErrUnsupportedFormat) {
            log.Printf("unsupported format: %v", err)
        } else {
            log.Printf("extraction error: %v", err)
        }
        return
    }

    // fmt.Println writes to stdout; the builtin println is a bootstrapping
    // helper that writes to stderr and is not guaranteed to stay in the language.
    fmt.Println("Content:", result.Content)
}
Java
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.KreuzbergRsException;
import java.nio.file.Paths;

try {
    ExtractionConfig config = ExtractionConfig.builder().build();
    ExtractionResult result = Kreuzberg.extractFileSync(Paths.get("missing.pdf"), config);
    System.out.println(result.content());
} catch (KreuzbergRsException e) {
    System.err.println("Extraction failed: " + e.getMessage());
    System.err.println("Error code: " + e.getCode());
}
Kotlin
import dev.kreuzberg.*
import java.nio.file.Paths

fun main() {
    val config = ExtractionConfig.builder().build()
    try {
        val result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, config)
        println(result.content())
    } catch (e: KreuzbergRsException) {
        System.err.println("Extraction failed: ${e.message}")
        System.err.println("Error code: ${e.code}")
    } catch (e: Exception) {
        System.err.println("Unexpected error: ${e.message}")
    }
}
Python
from kreuzberg import extract_file_sync, extract_bytes_sync, ExtractionConfig
from kreuzberg import (
    KreuzbergError,
    ParsingError,
    OCRError,
    ValidationError,
)

try:
    result = extract_file_sync("document.pdf")
    print(f"Extracted {len(result.content)} characters")
except FileNotFoundError as e:
    print(f"File not found: {e}")
except ParsingError as e:
    print(f"Failed to parse document: {e}")
except OCRError as e:
    print(f"OCR processing failed: {e}")
except KreuzbergError as e:
    print(f"Extraction error: {e}")

try:
    config: ExtractionConfig = ExtractionConfig()
    pdf_bytes: bytes = b"%PDF-1.4\n"
    result = extract_bytes_sync(pdf_bytes, "application/pdf", config)
    print(f"Extracted: {result.content[:100]}")
except ValidationError as e:
    print(f"Invalid configuration: {e}")
except OCRError as e:
    print(f"OCR failed: {e}")
except KreuzbergError as e:
    print(f"Extraction failed: {e}")
Ruby
require 'kreuzberg'

# Try to extract a file that does not exist and classify the failure by
# inspecting the error message. Validation errors are reported and swallowed;
# every other error is reported and re-raised.
begin
  result = Kreuzberg.extract_file_sync('missing.pdf')
  puts result.content
rescue RuntimeError => e
  # All extraction errors are raised as RuntimeError
  # Check error message for specific error details
  case e.message
  when /validation/i
    puts "Validation error: #{e.message}"
  # Match "IO" / "I/O" only as a whole word: a bare /io/ also matches
  # unrelated messages containing e.g. "extraction" or "configuration".
  when /\bi\/?o\b|not found/i
    puts "IO error: #{e.message}"
    raise
  else
    puts "Extraction failed: #{e.message}"
    raise
  end
end
R
library(kreuzberg)

content <- charToRaw("Hello, world!")

result <- tryCatch(
  {
    json <- extract_bytes_sync(
      content = content,
      mime_type = "application/x-nonexistent",
      config = ExtractionConfig$default()
    )
    jsonlite::fromJSON(json, simplifyVector = FALSE)
  },
  error = function(e) {
    message(sprintf("Extraction failed: %s", conditionMessage(e)))
    NULL
  }
)

if (is.null(result)) {
  cat("No content extracted; falling back to original bytes\n")
} else {
  cat(sprintf("Extracted %d characters\n", nchar(result$content)))
}
Rust
use kreuzberg::{extract_file_sync, ExtractionConfig, KreuzbergError};

/// Extracts `document.pdf` and prints its content, or reports the failure on
/// stderr with a message chosen by error variant.
fn main() {
    let config = ExtractionConfig::default();

    // Handle the success path first so the error match below stays flat.
    let failure = match extract_file_sync("document.pdf", None, &config) {
        Ok(extraction) => {
            println!("{}", extraction.content);
            return;
        }
        Err(failure) => failure,
    };

    match failure {
        KreuzbergError::Io(source) => eprintln!("File error: {source}"),
        KreuzbergError::UnsupportedFormat(mime) => eprintln!("Unsupported format: {mime}"),
        KreuzbergError::Parsing { message, .. } => {
            eprintln!("Corrupt or invalid document: {message}")
        }
        KreuzbergError::MissingDependency(dep) => {
            eprintln!("Missing dependency — install {dep}")
        }
        // Any variant not singled out above falls back to its Display output.
        other => eprintln!("Extraction failed: {other}"),
    }
}
Swift
import Foundation
import Kreuzberg
import RustBridge

// The Swift binding throws `RustString` (not `KreuzbergError`) for every
// failure surfaced from the Rust core. The string preserves the original
// error variant name and message (e.g. "UnsupportedFormat: ...",
// "MissingDependency: ...", "Parsing: ...") so callers can pattern-match
// on the prefix or simply print the message.
do {
    let config = try extractionConfigFromJson("{}")
    let result = try extractFileSync("document.pdf", nil, config)
    print(result.content().toString())
} catch let error as RustString {
    let message = error.toString()
    if message.contains("UnsupportedFormat") {
        print("Unsupported format: \(message)")
    } else if message.contains("MissingDependency") {
        print("Install the required dependency: \(message)")
    } else if message.contains("Parsing") {
        print("Corrupt or invalid document: \(message)")
    } else if message.contains("Io") {
        print("File error: \(message)")
    } else {
        print("Extraction failed: \(message)")
    }
} catch {
    print("Unexpected error: \(error)")
}
Elixir
# Example: Handling extraction errors
case Kreuzberg.extract_file("document.pdf") do
  {:ok, result} ->
    IO.puts("Successfully extracted content")
    # String.length/1 counts characters (graphemes); byte_size/1 would count bytes
    IO.puts("Content length: #{String.length(result.content)} characters")

  {:error, reason} ->
    IO.puts("Extraction failed: #{reason}")
end

# Example: Handling with custom error message
result = Kreuzberg.extract_file("nonexistent.pdf")

case result do
  # The payload is not used here; the underscore prefix avoids an
  # unused-variable compiler warning.
  {:ok, _data} ->
    IO.puts("File processed successfully")
  {:error, error} ->
    IO.puts("Error details: #{inspect(error)}")
end

# Example: Extract with pattern matching
case Kreuzberg.extract(<<>>, "application/pdf") do
  {:ok, result} ->
    IO.puts("Content: #{result.content}")
  # Binary reasons are printed as-is; anything else goes through inspect/1
  {:error, msg} when is_binary(msg) ->
    IO.puts("Validation error: #{msg}")
  {:error, reason} ->
    IO.puts("Unknown error: #{inspect(reason)}")
end
TypeScript
// The Node.js binding is published as @kreuzberg/node, the package every
// other TypeScript example on this page imports from.
import { extractFileSync } from "@kreuzberg/node";

// Try to extract a missing file, log the failure message, and rethrow so the
// caller still sees the error.
try {
  const result = extractFileSync("missing.pdf");
  console.log(result.content);
} catch (error: unknown) {
  // `error` is `unknown`; narrow it before reading `.message`.
  if (error instanceof Error) {
    console.error(`Extraction failed: ${error.message}`);
  }
  throw error;
}
WASM
// Use the @kreuzberg/wasm package and its initWasm() initializer, as the
// other WASM examples on this page do.
import { extractBytes, initWasm } from "@kreuzberg/wasm";

await initWasm();

const fileInput = document.getElementById("file") as HTMLInputElement;
const file = fileInput.files?.[0];

if (file) {
  try {
    const bytes = new Uint8Array(await file.arrayBuffer());
    // Fall back to PDF when the browser does not report a MIME type.
    const result = await extractBytes(bytes, file.type || "application/pdf", undefined);
    console.log(`Extracted: ${result.content.length} characters`);
  } catch (error) {
    // Thrown values are not guaranteed to be Error instances.
    const message = error instanceof Error ? error.message : String(error);
    console.error("Extraction failed:", message);
  }
}
Zig
const std = @import("std");
const kreuzberg = @import("kreuzberg");

pub fn main() !void {
    const config_json = "{}";
    const result_json = kreuzberg.extract_file_sync("document.pdf", null, config_json) catch |err| {
        const stderr = std.io.getStdErr().writer();
        switch (err) {
            error.Io => try stderr.print("File error\n", .{}),
            error.UnsupportedFormat => try stderr.print("Unsupported format\n", .{}),
            error.Parsing => try stderr.print("Corrupt or invalid document\n", .{}),
            error.MissingDependency => try stderr.print("Missing dependency — install required backend\n", .{}),
            error.Ocr => try stderr.print("OCR processing failed\n", .{}),
            error.OutOfMemory => try stderr.print("Out of memory\n", .{}),
            else => try stderr.print("Extraction failed: {s}\n", .{@errorName(err)}),
        }
        if (kreuzberg._last_error()) |context| {
            try stderr.print("  context: {s}\n", .{context});
        }
        return;
    };
    defer std.heap.c_allocator.free(result_json);

    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}

OCR for Scanned Documents

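Kreuzberg runs OCR automatically when it detects an image or scanned PDF. You can also configure the OCR backend and language, or force OCR on any document with the force-OCR option (for example `ForceOcr = true` in C# or `force_ocr = TRUE` in R):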

C
#include "kreuzberg.h"
#include <stdio.h>

/*
 * Run OCR on "scanned.png" with a Tesseract/English OCR configuration and
 * print the recognised text.
 * Returns 0 on success and 1 when extraction fails.
 *
 * NOTE(review): this example uses struct CExtractionResult, a config builder
 * and kreuzberg_extract_file_sync_with_config(), unlike the
 * KREUZBERGExtractionResult API used by other C examples on this page —
 * confirm it matches the current header.
 */
int main(void) {
    struct ConfigBuilder *builder = kreuzberg_config_builder_new();
    kreuzberg_config_builder_set_ocr(builder,
        "{\"tesseract\":{\"language\":\"eng\"}}");
    /* NOTE(review): the builder is never freed here; presumably build()
     * consumes it — confirm. */
    ExtractionConfig *config = kreuzberg_config_builder_build(builder);

    char *config_json = kreuzberg_config_to_json(config);
    struct CExtractionResult *result =
        kreuzberg_extract_file_sync_with_config("scanned.png", config_json);

    /* Track the outcome so a failed extraction is reflected in the exit code. */
    int status = 0;
    if (result && result->success) {
        printf("OCR text: %s\n", result->content);
    } else {
        fprintf(stderr, "OCR error: %s\n", kreuzberg_get_error_details().message);
        status = 1;
    }

    if (result)
        kreuzberg_free_result(result);
    kreuzberg_free_string(config_json);
    kreuzberg_config_free(config);
    return status;
}
C#
using Kreuzberg;

var config = new ExtractionConfig
{
    ForceOcr = true,
    Ocr = new OcrConfig
    {
        Backend = "tesseract",
        Language = "eng",
    },
};

var result = KreuzbergLib.ExtractFileSync("scanned.pdf", config);
Console.WriteLine(result.Content);
Console.WriteLine(result.DetectedLanguages);
Dart
import 'package:kreuzberg/kreuzberg.dart';

Future<void> main() async {
  final config = ExtractionConfig(
    useCache: true,
    enableQualityProcessing: true,
    forceOcr: false,
    disableOcr: false,
    ocr: const OcrConfig(
      enabled: true,
      backend: 'tesseract',
      language: 'eng',
      autoRotate: false,
    ),
    resultFormat: ResultFormat.unified,
    outputFormat: OutputFormat.plain(),
    includeDocumentStructure: false,
    maxArchiveDepth: 3,
    useLayoutForMarkdown: false,
  );

  final result = await KreuzbergBridge.extractFile('scanned.pdf', null, config);
  print(result.content);
}
Go
package main

import (
    "log"

    // NOTE(review): the other Go examples on this page import
    // "github.com/kreuzberg-dev/kreuzberg/v5" — confirm which module path is
    // correct and use it consistently.
    "github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)

// main extracts scanned.pdf with an OCR configuration that selects the
// Tesseract backend and English, then logs the length of the extracted
// content. It exits via log.Fatalf if extraction fails.
func main() {
    cfg := kreuzberg.ExtractionConfig{
        Ocr: &kreuzberg.OcrConfig{
            Backend:  "tesseract",
            Language: "eng",
        },
    }

    result, err := kreuzberg.ExtractFileSync("scanned.pdf", nil, cfg)
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }
    // Logs the content length only, not the text itself.
    log.Println(len(result.Content))
}
Java
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergRsException;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.OcrConfig;
import java.nio.file.Paths;

/**
 * Extracts {@code scanned.pdf} with an OCR configuration that selects the
 * Tesseract backend and English, and prints the extracted content.
 *
 * <p>Uses the same entry point ({@code extractFileSync} with a {@code Path}),
 * result accessor ({@code content()}) and exception type
 * ({@code KreuzbergRsException}) as the other Java examples on this page.
 */
public class Main {
    public static void main(String[] args) {
        try {
            ExtractionConfig config = ExtractionConfig.builder()
                .ocr(OcrConfig.builder()
                    .backend("tesseract")
                    .language("eng")
                    .build())
                .build();

            ExtractionResult result = Kreuzberg.extractFileSync(Paths.get("scanned.pdf"), config);
            System.out.println(result.content());
        } catch (KreuzbergRsException e) {
            System.err.println("Extraction failed: " + e.getMessage());
        }
    }
}
Kotlin
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional

fun main() {
    val ocr = OcrConfig.builder()
        .withBackend("tesseract")
        .withLanguage("eng")
        .build()

    val config = ExtractionConfig.builder()
        .withOcr(Optional.of(ocr))
        .build()

    val result = Kreuzberg.extractFileSync(Paths.get("scanned.pdf"), null, config)
    println(result.content())
}
Python
from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig

config: ExtractionConfig = ExtractionConfig(
    ocr=OcrConfig(backend="tesseract", language="eng")
)

result = extract_file_sync("scanned.pdf", config=config)

content: str = result.content
preview: str = content[:100]
total_length: int = len(content)

print(f"Extracted content (preview): {preview}")
print(f"Total characters: {total_length}")
Ruby
require 'kreuzberg'

ocr_config = Kreuzberg::OcrConfig.new(
  backend: 'tesseract',
  language: 'eng'
)

config = Kreuzberg::ExtractionConfig.new(ocr: ocr_config)
result = Kreuzberg.extract_file_sync('scanned.pdf', config: config)
puts result.content
R
library(kreuzberg)

# Configure Tesseract OCR
config <- list(
  force_ocr = TRUE,
  ocr = list(backend = "tesseract", language = "eng")
)

# Extract text from a scanned image
json <- extract_file_sync("scan.png", "image/png", config)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

cat(sprintf("Extracted %d characters\n", nchar(result$content)))
cat("Content preview:\n")
cat(substr(result$content, 1, 200))
Rust
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};

/// Extracts `scanned.pdf` with an OCR configuration that selects the
/// Tesseract backend and English, then prints the extracted content.
fn main() -> kreuzberg::Result<()> {
    // Only backend and language are set; every other field keeps its default.
    let ocr = OcrConfig {
        backend: String::from("tesseract"),
        language: String::from("eng"),
        ..OcrConfig::default()
    };
    let config = ExtractionConfig {
        ocr: Some(ocr),
        ..ExtractionConfig::default()
    };

    let extraction = extract_file_sync("scanned.pdf", None, &config)?;
    println!("{}", extraction.content);
    Ok(())
}
Swift
import Foundation
import Kreuzberg
import RustBridge

let configJson = """
{
    "ocr": {
        "backend": "tesseract",
        "language": "eng"
    }
}
"""

let config = try extractionConfigFromJson(configJson)
let result = try extractFileSync("scanned.pdf", nil, config)

print(result.content().toString())
Elixir
alias Kreuzberg.ExtractionConfig

config = %ExtractionConfig{
  ocr: %{"enabled" => true, "backend" => "tesseract"}
}

{:ok, result} = Kreuzberg.extract_file("scanned_document.pdf", nil, config)

content = result.content
IO.puts("OCR Extracted content:")
IO.puts(content)
IO.puts("Metadata: #{inspect(result.metadata)}")
TypeScript
import { extractFileSync } from "@kreuzberg/node";

const config = {
  ocr: {
    backend: "tesseract",
    language: "eng",
  },
};

const result = extractFileSync("scanned.pdf", null, config);
console.log(result.content);
WASM (Browser)
import { enableOcr, extractFromFile, initWasm } from "@kreuzberg/wasm";

await initWasm();
await enableOcr();

const fileInput = document.getElementById("file") as HTMLInputElement;
const file = fileInput.files?.[0];

if (file) {
  const result = await extractFromFile(file, file.type, {
    ocr: {
      backend: "kreuzberg-tesseract",
      language: "eng",
    },
  });
  console.log(result.content);
}
WASM (Node.js / Deno / Bun)
import { enableOcr, extractFile, initWasm } from "@kreuzberg/wasm";

await initWasm();
await enableOcr(); // Uses native kreuzberg-tesseract backend

const result = await extractFile("./scanned_document.png", "image/png", {
  ocr: {
    backend: "kreuzberg-tesseract",
    language: "eng",
  },
});
console.log(result.content);
Zig
const std = @import("std");
const kreuzberg = @import("kreuzberg");

/// Extracts scanned.pdf with a JSON config that selects the Tesseract OCR
/// backend and English, and prints the returned JSON result to stdout.
pub fn main() !void {
    const config_json =
        \\{
        \\  "ocr": {
        \\    "backend": "tesseract",
        \\    "language": "eng"
        \\  }
        \\}
    ;

    const result_json = try kreuzberg.extract_file_sync("scanned.pdf", null, config_json);
    // The result is released through the C allocator.
    defer std.heap.c_allocator.free(result_json);

    // Print the result directly; copying it into a second allocator first
    // adds an allocation and a failure path without changing the output.
    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}
Bash
kreuzberg extract scanned.pdf --ocr true

Process Multiple Files

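Pass a list of paths — or, in bindings that use batch items, a list of items that can each carry a per-file config override — to extract them in parallel: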

C
#include "kreuzberg.h"
#include <stdio.h>
#include <stdlib.h>

/*
 * Batch extraction example: extract three files in one call and print the
 * resulting JSON array to stdout.
 * Returns 0 on success and 1 when the batch call fails.
 */
int main(void) {
    /* Items is a JSON array of BatchFileItem objects.
     * Each entry has a "path" field and an optional "config" override. */
    const char *items_json =
        "["
        "  {\"path\": \"doc1.pdf\"},"
        "  {\"path\": \"doc2.docx\"},"
        "  {\"path\": \"scan.png\", \"config\": {\"force_ocr\": true}}"
        "]";

    KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_default();

    /* Returns a JSON array of ExtractionResult objects, or NULL on failure. */
    char *results_json =
        kreuzberg_batch_extract_files_sync(items_json, config);
    if (!results_json) {
        /* Read the code first, then the context, and guard the context against
         * NULL before handing it to %s — same order and guard as the
         * error-handling example on this page. */
        int code = kreuzberg_last_error_code();
        const char *context = kreuzberg_last_error_context();
        fprintf(stderr, "batch extraction failed (code %d): %s\n",
                code, context ? context : "(no message)");
        kreuzberg_extraction_config_free(config);
        return 1;
    }

    printf("%s\n", results_json);
    kreuzberg_free_string(results_json);
    kreuzberg_extraction_config_free(config);
    return 0;
}
C#
using Kreuzberg;

var files = new[] { "doc1.pdf", "doc2.docx", "doc3.pptx" };
var results = KreuzbergLib.BatchExtractFilesSync(files, new ExtractionConfig());

foreach (var result in results)
{
    Console.WriteLine($"Content length: {result.Content.Length}");
}
Dart
import 'package:kreuzberg/kreuzberg.dart';

Future<void> main() async {
  final items = <BatchFileItem>[
    const BatchFileItem(path: 'doc1.pdf'),
    BatchFileItem(
      path: 'scan.pdf',
      config: FileExtractionConfig(forceOcr: true),
    ),
  ];

  // Sync semantics — flutter_rust_bridge still returns a Future from Dart.
  final results = await KreuzbergBridge.batchExtractFilesSync(items);

  print('Processed ${results.length} files');
  for (final result in results) {
    print('${result.mimeType}: ${result.content.length} chars');
  }
}
Go
package main

import (
    "fmt"
    "log"

    "github.com/kreuzberg-dev/kreuzberg/v5"
)

// main extracts three documents in one batch call and prints the content
// length of each result, exiting via log.Fatalf if the batch call fails.
func main() {
    items := []kreuzberg.BatchFileItem{
        {Path: "doc1.pdf"},
        {Path: "doc2.docx"},
        {Path: "doc3.pptx"},
    }

    results, err := kreuzberg.BatchExtractFilesSync(items, kreuzberg.ExtractionConfig{})
    if err != nil {
        log.Fatalf("batch extraction failed: %v", err)
    }

    for i, result := range results {
        // fmt.Println writes to stdout; the builtin println is a bootstrapping
        // helper that writes to stderr and is not guaranteed to stay in the language.
        fmt.Println("Doc", i, "content length:", len(result.Content))
    }
}
Java
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.BatchFileItem;
import dev.kreuzberg.ExtractionConfig;
import java.nio.file.Paths;
import java.util.List;
import java.util.Arrays;

List<BatchFileItem> items = Arrays.asList(
    new BatchFileItem(Paths.get("doc1.pdf"), null),
    new BatchFileItem(Paths.get("doc2.docx"), null),
    new BatchFileItem(Paths.get("doc3.pptx"), null)
);

ExtractionConfig config = ExtractionConfig.builder().build();
List<ExtractionResult> results = Kreuzberg.batchExtractFilesSync(items, config);

for (ExtractionResult result : results) {
    System.out.println("Content length: " + result.content().length());
}
Kotlin
import dev.kreuzberg.*
import java.nio.file.Paths

fun main() {
    val config = ExtractionConfig.builder().build()
    val items = listOf(
        BatchFileItem(Paths.get("doc1.pdf"), null),
        BatchFileItem(Paths.get("doc2.docx"), null),
        BatchFileItem(Paths.get("report.pdf"), null),
    )
    val results = Kreuzberg.batchExtractFilesSync(items, config)

    results.forEachIndexed { index, result ->
        println("File $index: ${result.content().length} chars")
    }
}
Python
from kreuzberg import batch_extract_files_sync, BatchFileItem, ExtractionConfig

items = [
    BatchFileItem(path="doc1.pdf"),
    BatchFileItem(path="doc2.docx"),
    BatchFileItem(path="doc3.html"),
]

results = batch_extract_files_sync(items, ExtractionConfig())

for i, result in enumerate(results):
    print(f"Document {i}: {len(result.content)} chars, {len(result.tables)} tables")
Ruby
require 'kreuzberg'

items = [
  Kreuzberg::BatchFileItem.new(path: 'doc1.pdf'),
  Kreuzberg::BatchFileItem.new(path: 'doc2.docx'),
  Kreuzberg::BatchFileItem.new(path: 'doc3.pptx')
]

config = Kreuzberg::ExtractionConfig.new(use_cache: true)

results = Kreuzberg.batch_extract_files_sync(items, config: config)

results.each_with_index do |result, idx|
  puts "Document #{idx + 1}:"
  puts "  Extracted: #{result.content.length} characters"
  puts "  Quality: #{result.quality_score}"
  puts "  MIME: #{result.mime_type}"
end
R
library(kreuzberg)

items <- jsonlite::toJSON(list(
  list(path = "report.pdf"),
  list(path = "slides.pptx"),
  list(path = "data.xlsx")
), auto_unbox = TRUE)

json <- batch_extract_files_sync(items = items, config = ExtractionConfig$default())
results <- jsonlite::fromJSON(json, simplifyVector = FALSE)

for (i in seq_along(results)) {
  cat(sprintf("[%d] mime=%s chars=%d\n",
              i, results[[i]]$mime_type, nchar(results[[i]]$content)))
}
Rust
use kreuzberg::{batch_extract_files_sync, BatchFileItem, ExtractionConfig};

/// Extracts three documents in a single batch call and prints, for each
/// result, its position in the batch and the length of its content.
fn main() -> kreuzberg::Result<()> {
    // No per-file override: every item falls back to the shared config.
    let items: Vec<BatchFileItem> = ["doc1.pdf", "doc2.docx", "report.pdf"]
        .iter()
        .map(|&name| BatchFileItem { path: name.into(), config: None })
        .collect();
    let config = ExtractionConfig::default();

    let results = batch_extract_files_sync(items, &config)?;
    for (index, extraction) in results.iter().enumerate() {
        let length = extraction.content.len();
        println!("File {index}: {length} chars");
    }
    Ok(())
}
Swift
import Foundation
import Kreuzberg
import RustBridge

// `BatchFileItem` is an opaque swift-bridge class with no public Swift
// constructor — build items from JSON via `batchFileItemFromJson`.
let items = RustVec<BatchFileItem>()
for path in ["doc1.pdf", "doc2.docx", "report.pdf"] {
    let json = "{\"path\": \"\(path)\"}"
    items.push(value: try batchFileItemFromJson(json))
}

let config = try extractionConfigFromJson("{}")
let results = try batchExtractFilesSync(items, config)

for (index, result) in results.enumerated() {
    print("File \(index): \(result.content().toString().count) chars")
}
Elixir
# Extract several files in one call and print a short summary per result.
file_paths = ["document1.pdf", "document2.pdf", "document3.pdf"]

{:ok, results} = Kreuzberg.batch_extract_files(file_paths)

Enum.each(results, fn result ->
  # The value printed here is the MIME type, so label it as such.
  IO.puts("MIME type: #{result.mime_type}")
  # String.length/1 counts characters (graphemes); byte_size/1 would count bytes
  IO.puts("Content length: #{String.length(result.content)} characters")
  IO.puts("Tables: #{length(result.tables)}")
  IO.puts("---")
end)

IO.puts("Total files processed: #{length(results)}")
TypeScript
import { batchExtractFilesSync } from "@kreuzberg/node";

const files = ["doc1.pdf", "doc2.docx", "doc3.pptx"];
const results = batchExtractFilesSync(files);

results.forEach((result, i) => {
  console.log(`File ${i + 1}: ${result.content.length} characters`);
});
WASM
import { extractFromFile, initWasm } from "@kreuzberg/wasm";

await initWasm();

const fileInputs = document.getElementById("files") as HTMLInputElement;
const files = Array.from(fileInputs.files || []);

const results = await Promise.all(files.map((file) => extractFromFile(file)));

results.forEach((result, i) => {
  console.log(`File ${i + 1}: ${result.content.length} characters`);
});
Zig
const std = @import("std");
const kreuzberg = @import("kreuzberg");

pub fn main() !void {
    const out = std.io.getStdOut().writer();

    // The config is passed as JSON too; here it is an empty object.
    const config_json = "{}";

    // Batch items cross the FFI boundary as one JSON-encoded array.
    const batch_json =
        \\[
        \\  {"path": "doc1.pdf", "config": null},
        \\  {"path": "doc2.docx", "config": null},
        \\  {"path": "report.pdf", "config": null}
        \\]
    ;

    // The returned buffer is released through the C allocator on scope exit.
    const payload = try kreuzberg.batch_extract_files_sync(batch_json, config_json);
    defer std.heap.c_allocator.free(payload);

    try out.print("{s}\n", .{payload});
}
Bash
# Process multiple files
kreuzberg extract doc1.pdf doc2.docx doc3.pptx

# Use glob patterns.
# Bash expands `**` recursively only when the globstar option is enabled;
# without it, `**` matches the same as a single `*`.
shopt -s globstar
kreuzberg extract documents/**/*.pdf

Read Document Metadata

Every extraction result includes format-specific metadata — page count for PDFs, sheet names for Excel, dimensions for images:

C
#include "kreuzberg.h"
#include <stdio.h>

/*
 * Extracts "document.pdf" and prints its content, MIME type, and whichever
 * optional metadata fields are set on the result.
 *
 * Returns 0 on success, 1 if extraction failed.
 */
int main(void) {
    struct CExtractionResult *result = kreuzberg_extract_file_sync("document.pdf");
    if (!result || !result->success) {
        fprintf(stderr, "Error: %s\n", kreuzberg_get_error_details().message);
        /*
         * A non-NULL result that reports failure would otherwise leak.
         * NOTE(review): assumes failed results are released the same way as
         * successful ones — confirm against kreuzberg.h.
         */
        if (result)
            kreuzberg_free_result(result);
        return 1;
    }

    /* Passing NULL to %s is undefined behavior, so substitute a placeholder. */
    printf("Content: %s\n", result->content ? result->content : "(empty)");
    printf("MIME: %s\n", result->mime_type ? result->mime_type : "(unknown)");

    /* Optional fields are printed only when present. */
    if (result->language)
        printf("Language: %s\n", result->language);
    if (result->date)
        printf("Date: %s\n", result->date);
    if (result->subject)
        printf("Subject: %s\n", result->subject);
    if (result->metadata_json)
        printf("Metadata: %s\n", result->metadata_json);

    kreuzberg_free_result(result);
    return 0;
}
C#
using Kreuzberg;

// One config, with PDF metadata extraction switched on, is shared by both calls.
var options = new ExtractionConfig
{
    PdfOptions = new PdfConfig { ExtractMetadata = true }
};

// PDF document: page count, author, and title.
var pdfResult = KreuzbergLib.ExtractFileSync("document.pdf", options);
if (pdfResult.Metadata?.Format.Pdf is { } pdf)
{
    Console.WriteLine($"Pages: {pdf.PageCount}");
    Console.WriteLine($"Author: {pdf.Author}");
    Console.WriteLine($"Title: {pdf.Title}");
}

// HTML document: every field below is printed only when it is populated.
var pageResult = KreuzbergLib.ExtractFileSync("page.html", options);
if (pageResult.Metadata?.Format.Html is { } page)
{
    Console.WriteLine($"Title: {page.Title}");
    Console.WriteLine($"Description: {page.Description}");

    // Keywords are a collection
    if (page.Keywords is { Count: > 0 } keywords)
    {
        var joinedKeywords = string.Join(", ", keywords);
        Console.WriteLine($"Keywords: {joinedKeywords}");
    }

    // Canonical URL (renamed from canonical)
    if (page.CanonicalUrl is { } canonicalUrl)
    {
        Console.WriteLine($"Canonical URL: {canonicalUrl}");
    }

    // Open Graph fields live in a dictionary
    if (page.OpenGraph is { Count: > 0 } openGraph)
    {
        if (openGraph.TryGetValue("image", out var ogImage))
            Console.WriteLine($"Open Graph Image: {ogImage}");
        if (openGraph.TryGetValue("title", out var ogTitle))
            Console.WriteLine($"Open Graph Title: {ogTitle}");
        if (openGraph.TryGetValue("type", out var ogType))
            Console.WriteLine($"Open Graph Type: {ogType}");
    }

    // Twitter Card fields live in a dictionary as well
    if (page.TwitterCard is { Count: > 0 } twitterCard)
    {
        if (twitterCard.TryGetValue("card", out var cardType))
            Console.WriteLine($"Twitter Card Type: {cardType}");
        if (twitterCard.TryGetValue("creator", out var creator))
            Console.WriteLine($"Twitter Creator: {creator}");
    }

    // Newer fields
    if (page.Language is { } language)
        Console.WriteLine($"Language: {language}");

    if (page.TextDirection is { } textDirection)
        Console.WriteLine($"Text Direction: {textDirection}");

    // Headers
    if (page.Headers is { Count: > 0 } headers)
    {
        var headerTexts = string.Join(", ", headers.Select(h => h.Text));
        Console.WriteLine($"Headers: {headerTexts}");
    }

    // Links
    if (page.Links is { Count: > 0 } links)
    {
        foreach (var link in links)
            Console.WriteLine($"Link: {link.Href} ({link.Text})");
    }

    // Images
    if (page.Images is { Count: > 0 } images)
    {
        var imageSources = string.Join(", ", images.Select(i => i.Src));
        Console.WriteLine($"Images: {imageSources}");
    }

    // Structured data
    if (page.StructuredData is { Count: > 0 } structuredData)
        Console.WriteLine($"Structured Data items: {structuredData.Count}");
}
Dart
import 'package:kreuzberg/kreuzberg.dart';

/// Extracts `document.pdf` and prints each metadata field that is set,
/// followed by every entry of the `additional` map.
Future<void> main() async {
  final extraction = await KreuzbergBridge.extractFile('document.pdf', null);
  final meta = extraction.metadata;

  // Nullable fields are copied into locals so the null check promotes them.
  final title = meta.title;
  if (title != null) print('Title: $title');

  final subject = meta.subject;
  if (subject != null) print('Subject: $subject');

  final authors = meta.authors;
  if (authors != null) print('Authors: ${authors.join(', ')}');

  final keywords = meta.keywords;
  if (keywords != null) print('Keywords: ${keywords.join(', ')}');

  final language = meta.language;
  if (language != null) print('Language: $language');

  final createdAt = meta.createdAt;
  if (createdAt != null) print('Created: $createdAt');

  final modifiedAt = meta.modifiedAt;
  if (modifiedAt != null) print('Modified: $modifiedAt');

  final durationMs = meta.extractionDurationMs;
  if (durationMs != null) print('Extraction took: $durationMs ms');

  meta.additional.forEach((key, value) {
    print('Additional[$key]: $value');
  });
}
Go
package main

import (
    "fmt"
    "log"
    "strings"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/v5"
)

// main extracts a PDF and an HTML page and prints the metadata fields that
// are populated on each result. Any extraction error terminates the program.
func main() {
    pdfResult, err := kreuzberg.ExtractFileSync("document.pdf", nil)
    if err != nil {
        log.Fatalf("extract pdf: %v", err)
    }

    // PDF metadata: optional fields are pointers, printed only when set.
    if pdfMeta, found := pdfResult.Metadata.PdfMetadata(); found {
        if pages := pdfMeta.PageCount; pages != nil {
            fmt.Printf("Pages: %d\n", *pages)
        }
        if author := pdfMeta.Author; author != nil {
            fmt.Printf("Author: %s\n", *author)
        }
        if title := pdfMeta.Title; title != nil {
            fmt.Printf("Title: %s\n", *title)
        }
    }

    htmlResult, err := kreuzberg.ExtractFileSync("page.html", nil)
    if err != nil {
        log.Fatalf("extract html: %v", err)
    }

    // Nothing else to print when the result carries no HTML metadata.
    page, found := htmlResult.Metadata.HTMLMetadata()
    if !found {
        return
    }

    if page.Title != nil {
        fmt.Printf("Title: %s\n", *page.Title)
    }
    if page.Description != nil {
        fmt.Printf("Description: %s\n", *page.Description)
    }

    // Keywords are a slice
    if len(page.Keywords) > 0 {
        fmt.Printf("Keywords: %s\n", strings.Join(page.Keywords, ", "))
    }

    // Canonical URL (renamed from canonical)
    if page.CanonicalURL != nil {
        fmt.Printf("Canonical URL: %s\n", *page.CanonicalURL)
    }

    // Open Graph fields come from a map; a missing key is simply skipped.
    for _, field := range []struct{ key, label string }{
        {"image", "Open Graph Image"},
        {"title", "Open Graph Title"},
        {"type", "Open Graph Type"},
    } {
        if value, ok := page.OpenGraph[field.key]; ok {
            fmt.Printf("%s: %s\n", field.label, value)
        }
    }

    // Twitter Card fields come from a map as well
    for _, field := range []struct{ key, label string }{
        {"card", "Twitter Card Type"},
        {"creator", "Twitter Creator"},
    } {
        if value, ok := page.TwitterCard[field.key]; ok {
            fmt.Printf("%s: %s\n", field.label, value)
        }
    }

    // Newer fields
    if page.Language != nil {
        fmt.Printf("Language: %s\n", *page.Language)
    }
    if page.TextDirection != nil {
        fmt.Printf("Text Direction: %s\n", *page.TextDirection)
    }

    // Headers: collect the text of each one and print them on a single line.
    if count := len(page.Headers); count > 0 {
        texts := make([]string, 0, count)
        for _, header := range page.Headers {
            texts = append(texts, header.Text)
        }
        fmt.Printf("Headers: %s\n", strings.Join(texts, ", "))
    }

    // Links (ranging over an empty slice prints nothing)
    for _, link := range page.Links {
        fmt.Printf("Link: %s (%s)\n", link.Href, link.Text)
    }

    // Images
    for _, image := range page.Images {
        fmt.Printf("Image: %s\n", image.Src)
    }

    // Structured data
    if count := len(page.StructuredData); count > 0 {
        fmt.Printf("Structured data items: %d\n", count)
    }
}
Java
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.Metadata;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;
import java.util.Map;
import java.util.List;

public class Main {
    /**
     * Prints {@code label} immediately followed by {@code value}, or nothing
     * at all when {@code value} is {@code null}.
     *
     * @param label text to print before the value, including any separator
     * @param value the value to print; {@code null} suppresses the line
     */
    private static void printIfPresent(String label, Object value) {
        if (value != null) {
            System.out.println(label + value);
        }
    }

    /**
     * Extracts {@code document.pdf} and {@code page.html} and prints the
     * metadata fields that are present on each result. Extraction failures
     * are reported on stderr rather than propagated.
     */
    public static void main(String[] args) {
        try {
            ExtractionResult result = Kreuzberg.extractFileSync("document.pdf");

            // Common fields are exposed through Optional-returning getters.
            Metadata metadata = result.getMetadata();
            metadata.getTitle().ifPresent(t -> System.out.println("Title: " + t));
            metadata.getAuthors().ifPresent(a -> System.out.println("Authors: " + String.join(", ", a)));

            // Format-specific fields are read from the additional map.
            Map<String, Object> extra = metadata.getAdditional();
            printIfPresent("Pages: ", extra.get("page_count"));

            // Access HTML metadata
            ExtractionResult htmlResult = Kreuzberg.extractFileSync("page.html");
            Metadata htmlMeta = htmlResult.getMetadata();
            htmlMeta.getTitle().ifPresent(t -> System.out.println("Title: " + t));

            Map<String, Object> htmlExtra = htmlMeta.getAdditional();
            printIfPresent("Description: ", htmlExtra.get("description"));

            // Access keywords as array
            htmlMeta.getKeywords().ifPresent(keywords ->
                System.out.println("Keywords: " + keywords));

            // Access canonical URL (renamed from canonical)
            printIfPresent("Canonical URL: ", htmlExtra.get("canonical_url"));

            // Access Open Graph fields from map; keys that are absent are
            // skipped instead of being printed as "null".
            @SuppressWarnings("unchecked")
            Map<String, String> openGraph = (Map<String, String>) htmlExtra.get("open_graph");
            if (openGraph != null) {
                printIfPresent("Open Graph Image: ", openGraph.get("image"));
                printIfPresent("Open Graph Title: ", openGraph.get("title"));
                printIfPresent("Open Graph Type: ", openGraph.get("type"));
            }

            // Access Twitter Card fields from map, with the same null handling.
            @SuppressWarnings("unchecked")
            Map<String, String> twitterCard = (Map<String, String>) htmlExtra.get("twitter_card");
            if (twitterCard != null) {
                printIfPresent("Twitter Card Type: ", twitterCard.get("card"));
                printIfPresent("Twitter Creator: ", twitterCard.get("creator"));
            }

            // Access new fields
            htmlMeta.getLanguage().ifPresent(l -> System.out.println("Language: " + l));
            printIfPresent("Text Direction: ", htmlExtra.get("text_direction"));

            // Access headers: join the texts so no trailing separator is left
            // at the end of the line.
            @SuppressWarnings("unchecked")
            List<Map<String, Object>> headers = (List<Map<String, Object>>) htmlExtra.get("headers");
            if (headers != null) {
                String headerTexts = headers.stream()
                    .map(h -> String.valueOf(h.get("text")))
                    .collect(java.util.stream.Collectors.joining(", "));
                System.out.println("Headers: " + headerTexts);
            }

            // Access links
            @SuppressWarnings("unchecked")
            List<Map<String, Object>> links = (List<Map<String, Object>>) htmlExtra.get("links");
            if (links != null) {
                for (Map<String, Object> link : links) {
                    System.out.println("Link: " + link.get("href") + " (" + link.get("text") + ")");
                }
            }

            // Access images
            @SuppressWarnings("unchecked")
            List<Map<String, Object>> images = (List<Map<String, Object>>) htmlExtra.get("images");
            if (images != null) {
                for (Map<String, Object> image : images) {
                    System.out.println("Image: " + image.get("src"));
                }
            }

            // Access structured data
            @SuppressWarnings("unchecked")
            List<Map<String, Object>> structuredData = (List<Map<String, Object>>) htmlExtra.get("structured_data");
            if (structuredData != null) {
                System.out.println("Structured data items: " + structuredData.size());
            }
        } catch (IOException | KreuzbergException e) {
            System.err.println("Extraction failed: " + e.getMessage());
        }
    }
}
Kotlin
import dev.kreuzberg.*
import java.nio.file.Paths
import java.util.Optional

/**
 * Extracts a PDF and an HTML page with a default-built config and prints the
 * metadata fields that are present on each result.
 */
fun main() {
    val config = ExtractionConfig.builder().build()

    val pdfResult = Kreuzberg.extractFileSync(Paths.get("document.pdf"), null, config)
    val meta = pdfResult.metadata()
    meta.title()?.also { println("Title: $it") }
    // joinToString() separates elements with ", " by default.
    meta.authors()?.also { println("Authors: ${it.joinToString()}") }

    // Format-specific metadata via discriminated union
    val pdf = meta.format()?.pdf()
    if (pdf != null) {
        pdf.pageCount()?.also { println("Pages: $it") }
        pdf.producer()?.also { println("Producer: $it") }
        pdf.pdfVersion()?.also { println("PDF Version: $it") }
    }

    // HTML metadata; nothing more to print when it is absent.
    val pageResult = Kreuzberg.extractFileSync(Paths.get("page.html"), null, config)
    val page = pageResult.metadata().format()?.html() ?: return

    page.title()?.also { println("Title: $it") }
    page.description()?.also { println("Description: $it") }
    page.canonicalUrl()?.also { println("Canonical URL: $it") }
    page.language()?.also { println("Language: $it") }

    // Keywords list
    println("Keywords: ${page.keywords()}")

    // Open Graph fields are exposed as a Map<String, String>
    val openGraph = page.openGraph()
    openGraph["image"]?.also { println("Open Graph Image: $it") }
    openGraph["title"]?.also { println("Open Graph Title: $it") }

    // Twitter Card fields as a Map<String, String>
    page.twitterCard()["card"]?.also { println("Twitter Card Type: $it") }

    // Headers
    page.headers().forEach { header ->
        println("Header (level ${header.level()}): ${header.text()}")
    }

    // Links
    page.links().forEach { link ->
        println("Link: ${link.href()} (${link.text()})")
    }

    // Images
    page.images().forEach { image ->
        println("Image: ${image.src()}")
    }

    // Structured data
    val structured = page.structuredData()
    if (structured.isNotEmpty()) {
        println("Structured data items: ${structured.size}")
    }
}
Python
from kreuzberg import extract_file_sync, ExtractionConfig

def show(label, value):
    """Print ``label: value``, skipping the line when ``value`` is falsy."""
    if value:
        print(f"{label}: {value}")


# --- PDF ---------------------------------------------------------------
# Metadata is flat — format-specific fields are at the top level
pdf_meta = extract_file_sync("document.pdf", config=ExtractionConfig()).metadata
show("Pages", pdf_meta.get("page_count"))
show("Title", pdf_meta.get("title"))
pdf_authors = pdf_meta.get("authors")
if pdf_authors:
    print(f"Authors: {', '.join(pdf_authors)}")

# --- HTML --------------------------------------------------------------
html_meta = extract_file_sync("page.html", config=ExtractionConfig()).metadata
show("Title", html_meta.get("title"))
show("Description", html_meta.get("description"))

# Keywords are a list
keyword_list = html_meta.get("keywords") or []
if keyword_list:
    print(f"Keywords: {', '.join(keyword_list)}")

# Canonical URL (renamed from canonical)
show("Canonical URL", html_meta.get("canonical_url"))

# Open Graph fields come from a map; a key is printed whenever it is present.
og_fields = html_meta.get("open_graph") or {}
for og_key, og_label in (
    ("image", "Open Graph Image"),
    ("title", "Open Graph Title"),
    ("type", "Open Graph Type"),
):
    if og_key in og_fields:
        print(f"{og_label}: {og_fields[og_key]}")

# Twitter Card fields come from a map as well
card_fields = html_meta.get("twitter_card") or {}
for card_key, card_label in (
    ("card", "Twitter Card Type"),
    ("creator", "Twitter Creator"),
):
    if card_key in card_fields:
        print(f"{card_label}: {card_fields[card_key]}")

# Newer fields
show("Language", html_meta.get("language"))
show("Text Direction", html_meta.get("text_direction"))

# Headers
header_items = html_meta.get("headers") or []
if header_items:
    print(f"Headers: {', '.join(h['text'] for h in header_items)}")

# Links
for link in html_meta.get("links") or []:
    print(f"Link: {link.get('href')} ({link.get('text')})")

# Images
for image in html_meta.get("images") or []:
    print(f"Image: {image.get('src')}")

# Structured data
structured_items = html_meta.get("structured_data") or []
if structured_items:
    print(f"Structured data items: {len(structured_items)}")
Ruby
require 'kreuzberg'

# Metadata is flat — format-specific fields are at the top level
pdf_meta = Kreuzberg.extract_file_sync('document.pdf').metadata

puts "Pages: #{pdf_meta['page_count']}" if pdf_meta['page_count']
puts "Title: #{pdf_meta['title']}" if pdf_meta['title']
puts "Authors: #{pdf_meta['authors'].join(', ')}" if pdf_meta['authors']

# HTML metadata
html_meta = Kreuzberg.extract_file_sync('page.html').metadata

puts "Title: #{html_meta['title']}" if html_meta['title']
puts "Description: #{html_meta['description']}" if html_meta['description']

# Keywords are an array
puts "Keywords: #{html_meta['keywords'].join(', ')}" if html_meta['keywords']

# Canonical URL (renamed from canonical)
if html_meta['canonical_url']
  puts "Canonical URL: #{html_meta['canonical_url']}"
end

# Open Graph fields come from a map; unset keys are skipped.
og = html_meta['open_graph'] || {}
{
  'image' => 'Open Graph Image',
  'title' => 'Open Graph Title',
  'type' => 'Open Graph Type'
}.each do |key, label|
  puts "#{label}: #{og[key]}" if og[key]
end

# Twitter Card fields come from a map as well
card = html_meta['twitter_card'] || {}
{
  'card' => 'Twitter Card Type',
  'creator' => 'Twitter Creator'
}.each do |key, label|
  puts "#{label}: #{card[key]}" if card[key]
end

# Newer fields
if html_meta['language']
  puts "Language: #{html_meta['language']}"
end
if html_meta['text_direction']
  puts "Text Direction: #{html_meta['text_direction']}"
end

# Headers
if (headers = html_meta['headers'])
  header_texts = headers.map { |h| h['text'] }
  puts "Headers: #{header_texts.join(', ')}"
end

# Links
(html_meta['links'] || []).each do |link|
  puts "Link: #{link['href']} (#{link['text']})"
end

# Images
(html_meta['images'] || []).each do |image|
  puts "Image: #{image['src']}"
end

# Structured data
if (structured = html_meta['structured_data'])
  puts "Structured data items: #{structured.length}"
end
R
library(kreuzberg)

doc <- extract_file_sync("document.pdf")

# Fields read directly from the result
cat("Detected Language:", doc$detected_language, "\n")
cat("Quality Score:", doc$quality_score, "\n")
keyword_text <- paste(doc$keywords, collapse = ", ")
cat("Keywords:", keyword_text, "\n\n")

cat("Metadata fields:\n")

# Look up one metadata field and print it under `label`; nothing is printed
# when metadata_field() returns NULL. `render` turns the value into the text
# to display.
show_field <- function(label, key, render = identity) {
  value <- metadata_field(doc, key)
  if (!is.null(value)) {
    cat(label, render(value), "\n")
  }
}

show_field("Authors:", "authors", function(x) paste(x, collapse = ", "))
show_field("Created Date:", "created_date")
show_field("Pages:", "page_count")
Rust
use kreuzberg::{extract_file_sync, ExtractionConfig};

/// Extracts a PDF and an HTML page with the default configuration and prints
/// the metadata fields that are present on each result.
///
/// # Errors
///
/// Returns the error from `extract_file_sync` if either extraction fails.
fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig::default();

    let pdf_doc = extract_file_sync("document.pdf", None, &config)?;
    if let Some(pdf) = &pdf_doc.metadata.pdf {
        if let Some(pages) = &pdf.page_count {
            println!("Pages: {pages}");
        }
        if let Some(author) = &pdf.author {
            println!("Author: {author}");
        }
        if let Some(title) = &pdf.title {
            println!("Title: {title}");
        }
    }

    let html_doc = extract_file_sync("page.html", None, &config)?;
    if let Some(page) = &html_doc.metadata.html {
        if let Some(title) = &page.title {
            println!("Title: {title}");
        }
        if let Some(description) = &page.description {
            println!("Description: {description}");
        }

        // Keywords array
        println!("Keywords: {:?}", page.keywords);

        // Canonical URL (renamed from canonical)
        if let Some(url) = &page.canonical_url {
            println!("Canonical URL: {url}");
        }

        // Open Graph fields are looked up in a map
        if let Some(image) = page.open_graph.get("image") {
            println!("Open Graph Image: {image}");
        }
        if let Some(title) = page.open_graph.get("title") {
            println!("Open Graph Title: {title}");
        }

        // Twitter Card fields are looked up in a map
        if let Some(card) = page.twitter_card.get("card") {
            println!("Twitter Card Type: {card}");
        }

        // Newer fields
        if let Some(language) = &page.language {
            println!("Language: {language}");
        }

        // Headers, links, and images: iterating an empty collection prints nothing.
        for header in &page.headers {
            println!("Header (level {}): {}", header.level, header.text);
        }
        for link in &page.links {
            println!("Link: {} ({})", link.href, link.text);
        }
        for image in &page.images {
            println!("Image: {}", image.src);
        }

        // Structured data
        let structured_count = page.structured_data.len();
        if structured_count > 0 {
            println!("Structured data items: {structured_count}");
        }
    }

    Ok(())
}
Swift
import Foundation
import Kreuzberg
import RustBridge

/// Prints `label: value`, doing nothing when `value` is nil.
func printField(_ label: String, _ value: String?) {
    guard let value = value else { return }
    print("\(label): \(value)")
}

// An empty JSON object is used as the extraction config.
let emptyConfig = try extractionConfigFromJson("{}")
let extraction = try extractFileSync("document.pdf", nil, emptyConfig)
let meta = extraction.metadata()

// String-valued fields: each accessor yields an optional bridged string.
printField("Title", meta.title()?.toString())
printField("Subject", meta.subject()?.toString())
printField("Language", meta.language()?.toString())
printField("Created at", meta.created_at()?.toString())
printField("Modified at", meta.modified_at()?.toString())
printField("Created by", meta.created_by()?.toString())

// List-valued fields are converted to Swift strings and printed as an array.
if let authorList = meta.authors() {
    let authorNames = authorList.map { $0.toString() }
    print("Authors: \(authorNames)")
}
if let keywordList = meta.keywords() {
    let keywordNames = keywordList.map { $0.toString() }
    print("Keywords: \(keywordNames)")
}

if let elapsedMs = meta.extraction_duration_ms() {
    print("Extraction duration (ms): \(elapsedMs)")
}
if let pageInfo = meta.pages() {
    print("Page count: \(pageInfo.total_count())")
}
Elixir
{:ok, pdf_result} = Kreuzberg.extract_file("document.pdf")

# Metadata is flat — format-specific fields are at the top level
pdf_meta = pdf_result.metadata
IO.puts("MIME type: #{pdf_result.mime_type}")

meta_keys = Map.keys(pdf_meta)
IO.puts("All metadata keys: #{inspect(meta_keys)}")

# PDF fields are read straight from the flat map
if page_count = pdf_meta["page_count"] do
  IO.puts("Page count: #{page_count}")
end

# A missing authors key is treated the same as an empty list.
case pdf_meta["authors"] || [] do
  [] -> :ok
  authors -> IO.puts("Authors: #{Enum.join(authors, ", ")}")
end

if title = pdf_meta["title"] do
  IO.puts("Title: #{title}")
end

# HTML fields are read straight from the flat map as well
{:ok, page_result} = Kreuzberg.extract_file("page.html")
page_meta = page_result.metadata

case page_meta["keywords"] || [] do
  [] -> :ok
  keywords -> IO.puts("Keywords: #{Enum.join(keywords, ", ")}")
end

if description = page_meta["description"] do
  IO.puts("Description: #{description}")
end
TypeScript
import { extractFileSync } from "@kreuzberg/node";

// PDF: dump the whole metadata object, then the page count when it is set.
const pdfResult = extractFileSync("document.pdf");
console.log(`Metadata: ${JSON.stringify(pdfResult.metadata)}`);
const { pageCount } = pdfResult.metadata;
if (pageCount) {
  console.log(`Pages: ${pageCount}`);
}

// HTML: dump the metadata, then walk the individual fields.
const htmlResult = extractFileSync("page.html");
const htmlMeta = htmlResult.metadata;
console.log(`HTML Metadata: ${JSON.stringify(htmlMeta)}`);

const {
  title,
  keywords,
  canonicalUrl,
  openGraph,
  twitterCard,
  language,
  textDirection,
  headers,
  links,
  images,
  structuredData,
} = htmlMeta;

if (title) {
  console.log(`Title: ${title}`);
}

// Keywords are an array
if (keywords?.length) {
  console.log(`Keywords: ${keywords.join(", ")}`);
}

// Canonical URL (renamed from canonical)
if (canonicalUrl) {
  console.log(`Canonical URL: ${canonicalUrl}`);
}

// Open Graph fields come from a map; unset keys are skipped.
if (openGraph) {
  const ogLabels: Array<[string, string]> = [
    ["image", "Open Graph Image"],
    ["title", "Open Graph Title"],
    ["type", "Open Graph Type"],
  ];
  for (const [key, label] of ogLabels) {
    const value = openGraph[key];
    if (value) {
      console.log(`${label}: ${value}`);
    }
  }
}

// Twitter Card fields come from a map as well
if (twitterCard) {
  const cardLabels: Array<[string, string]> = [
    ["card", "Twitter Card Type"],
    ["creator", "Twitter Creator"],
  ];
  for (const [key, label] of cardLabels) {
    const value = twitterCard[key];
    if (value) {
      console.log(`${label}: ${value}`);
    }
  }
}

// Newer fields
if (language) {
  console.log(`Language: ${language}`);
}
if (textDirection) {
  console.log(`Text Direction: ${textDirection}`);
}

// Headers
if (headers?.length) {
  const headerTexts = headers.map((h) => h.text);
  console.log(`Headers: ${headerTexts.join(", ")}`);
}

// Links
if (links?.length) {
  for (const link of links) {
    console.log(`Link: ${link.href} (${link.text})`);
  }
}

// Images
if (images?.length) {
  for (const image of images) {
    console.log(`Image: ${image.src}`);
  }
}

// Structured data
if (structuredData?.length) {
  console.log(`Structured data items: ${structuredData.length}`);
}
WASM
import { extractFromFile, initWasm } from "@kreuzberg/wasm";

await initWasm();

// Take the first file currently selected in the "file" input, if any.
const picker = document.getElementById("file") as HTMLInputElement;
const chosen = picker.files?.[0];

if (chosen) {
  const { metadata } = await extractFromFile(chosen);
  console.log(`Metadata: ${JSON.stringify(metadata)}`);

  // Common metadata fields
  if (metadata.title) {
    console.log(`Title: ${metadata.title}`);
  }

  // Format-specific metadata: HTML files
  const page = metadata.html;
  if (page) {
    console.log(`HTML Title: ${page.title}`);
    console.log(`Description: ${page.description}`);

    // Keywords are an array
    const keywords = page.keywords;
    if (keywords?.length) {
      console.log(`Keywords: ${keywords.join(", ")}`);
    }

    // Canonical URL
    const canonicalUrl = page.canonical_url;
    if (canonicalUrl) {
      console.log(`Canonical URL: ${canonicalUrl}`);
    }

    // Open Graph fields
    const openGraph = page.open_graph;
    if (openGraph) {
      const ogTitle = openGraph["title"];
      if (ogTitle) {
        console.log(`OG Title: ${ogTitle}`);
      }
      const ogImage = openGraph["image"];
      if (ogImage) {
        console.log(`OG Image: ${ogImage}`);
      }
    }

    // Twitter Card fields
    const cardType = page.twitter_card?.["card"];
    if (cardType) {
      console.log(`Twitter Card Type: ${cardType}`);
    }

    // Headers
    const headers = page.headers;
    if (headers?.length) {
      const headerTexts = headers.map((h: any) => h.text);
      console.log(`Headers: ${headerTexts.join(", ")}`);
    }

    // Links
    const links = page.links;
    if (links?.length) {
      for (const link of links as any[]) {
        console.log(`Link: ${link.href} (${link.text})`);
      }
    }

    // Images
    const images = page.images;
    if (images?.length) {
      for (const image of images as any[]) {
        console.log(`Image: ${image.src}`);
      }
    }

    // Structured data
    const structuredData = page.structured_data;
    if (structuredData?.length) {
      console.log(`Structured data items: ${structuredData.length}`);
    }
  }

  // PDF-specific fields are at the top level of metadata
  const { pageCount, authors } = metadata;
  if (pageCount) {
    console.log(`Pages: ${pageCount}`);
  }
  if (authors?.length) {
    console.log(`Authors: ${authors.join(", ")}`);
  }
}
Zig
const std = @import("std");
const kreuzberg = @import("kreuzberg");

// Extracts "document.pdf" and prints selected metadata fields (title, authors,
// language, creation date, page count) read from the JSON result.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    // The configuration is passed as JSON; an empty object is used here.
    const config_json = "{}";
    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    // NOTE(review): the result buffer is released through the C allocator, not
    // `allocator` — presumably because the binding allocates it that way; confirm.
    defer std.heap.c_allocator.free(result_json);

    // Parse the result into a dynamically typed JSON tree.
    var parsed = try std.json.parseFromSlice(std.json.Value, allocator, result_json, .{});
    defer parsed.deinit();

    // Anything other than a top-level object is ignored without an error.
    const root = parsed.value;
    if (root != .object) return;

    const stdout = std.io.getStdOut().writer();

    if (root.object.get("metadata")) |metadata_val| {
        if (metadata_val != .object) return;
        const metadata = metadata_val.object;

        // Each field is printed only when present and of the expected JSON type.
        if (metadata.get("title")) |title_val| {
            if (title_val == .string) {
                try stdout.print("Title: {s}\n", .{title_val.string});
            }
        }

        // Authors are an array; non-string entries are skipped.
        if (metadata.get("authors")) |authors_val| {
            if (authors_val == .array) {
                for (authors_val.array.items) |author| {
                    if (author == .string) {
                        try stdout.print("Author: {s}\n", .{author.string});
                    }
                }
            }
        }

        if (metadata.get("language")) |language_val| {
            if (language_val == .string) {
                try stdout.print("Language: {s}\n", .{language_val.string});
            }
        }

        if (metadata.get("created_at")) |created_val| {
            if (created_val == .string) {
                try stdout.print("Created: {s}\n", .{created_val.string});
            }
        }

        // The page count is read from the nested "pages" object ("total_count").
        if (metadata.get("pages")) |pages_val| {
            if (pages_val == .object) {
                if (pages_val.object.get("total_count")) |total_val| {
                    if (total_val == .integer) {
                        try stdout.print("Pages: {d}\n", .{total_val.integer});
                    }
                }
            }
        }
    }
}

Extract and parse metadata using JSON output:

Terminal
# Extract with metadata (JSON format includes metadata automatically)
kreuzberg extract document.pdf --format json

# Save to file and parse metadata
kreuzberg extract document.pdf --format json > result.json

# Print all metadata fields (jq reads the file directly; no cat needed)
jq '.metadata' result.json

# Extract HTML metadata
kreuzberg extract page.html --format json | jq '.metadata'

# Get specific fields
kreuzberg extract document.pdf --format json | \
  jq '.metadata | {page_count, authors, title}'

# Process multiple files
kreuzberg batch documents/*.pdf --format json > all_metadata.json

JSON Output Structure:

JSON
{
  "content": "Extracted text...",
  "mime_type": "application/pdf",
  "metadata": {
    "title": "Document Title",
    "authors": ["John Doe"],
    "created_by": "LaTeX with hyperref package",
    "format_type": "pdf",
    "page_count": 10
  },
  "tables": []
}

Kreuzberg extracts format-specific metadata for:

  • PDF: page count, title, authors (list), creation date, modification date
  • HTML: SEO tags, Open Graph, Twitter Card, structured data, headers, links, images
  • Excel: sheet count, sheet names
  • Email: from, to, CC, BCC, message ID, attachments
  • PowerPoint: title, author, description, fonts
  • Images: dimensions, format, EXIF data
  • Archives: format, file count, file list, sizes
  • XML: element count, unique elements
  • Text/Markdown: word count, line count, headers, links

See Types Reference for complete metadata reference.

Extract Tables

Tables come back as both structured cells and Markdown. Kreuzberg extracts them from PDFs, spreadsheets, and HTML:

C
#include "kreuzberg.h"
#include <stdio.h>

/* Extracts "spreadsheet.xlsx" and prints the detected tables as JSON.
 *
 * NOTE(review): the other C examples on this page call
 * kreuzberg_extract_file_sync(path, NULL, config) and work with
 * KREUZBERGExtractionResult accessors — confirm this snippet matches the
 * current header. */
int main(void) {
    struct CExtractionResult *result = kreuzberg_extract_file_sync("spreadsheet.xlsx");
    if (!result || !result->success) {
        fprintf(stderr, "Error: %s\n", kreuzberg_get_error_details().message);
        /* A non-NULL result that reports failure must still be released,
         * otherwise it leaks on this early return. */
        if (result) {
            kreuzberg_free_result(result);
        }
        return 1;
    }

    /* tables_json is NULL when no tables were found. */
    if (result->tables_json) {
        printf("Tables (JSON): %s\n", result->tables_json);
    } else {
        printf("No tables found\n");
    }

    kreuzberg_free_result(result);
    return 0;
}
C#
using Kreuzberg;

// Extract the document, then print each table's row count, Markdown and rows.
var extraction = KreuzbergLib.ExtractFileSync("document.pdf", new ExtractionConfig());

foreach (var tbl in extraction.Tables)
{
    var rows = tbl.Cells;
    Console.WriteLine($"Table with {rows.Count} rows");
    Console.WriteLine(tbl.Markdown);

    // One output line per row, cells separated by " | ".
    foreach (var cells in rows)
    {
        Console.WriteLine(string.Join(" | ", cells));
    }
}
Dart
import 'package:kreuzberg/kreuzberg.dart';

/// Extracts `document.pdf` and prints every table: page, row count, Markdown,
/// rows, and the bounding box when one is available.
Future<void> main() async {
  final extraction = await KreuzbergBridge.extractFile('document.pdf', null);

  for (final tbl in extraction.tables) {
    final rows = tbl.cells;
    print('Table on page ${tbl.pageNumber} with ${rows.length} rows');
    print(tbl.markdown);

    for (final cells in rows) {
      print(cells);
    }

    // The bounding box is nullable; skip it when absent.
    final box = tbl.boundingBox;
    if (box != null) {
      print('Bounding box: $box');
    }
  }
}
Go
package main

import (
    "fmt"
    "log"

    "github.com/kreuzberg-dev/kreuzberg/v5"
)

// main extracts document.pdf and prints every detected table: its row count,
// its Markdown rendering, and each row of cells.
func main() {
    // Same import path and call shape (path, MIME type, config) as the other
    // Go examples on this page; nil leaves the MIME type unspecified.
    result, err := kreuzberg.ExtractFileSync("document.pdf", nil, kreuzberg.ExtractionConfig{})
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }

    // Iterate over tables
    for _, table := range result.Tables {
        fmt.Printf("Table with %d rows\n", len(table.Cells))
        fmt.Println(table.Markdown) // Markdown representation

        // Access cells
        for _, row := range table.Cells {
            fmt.Println(row)
        }
    }
}
Java
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import dev.kreuzberg.Table;
import java.io.IOException;
import java.util.List;

/** Prints every table found in {@code document.pdf}. */
public class Main {
    /**
     * Extracts the document and writes each table's row count, Markdown
     * rendering and rows to standard output. Extraction failures are reported
     * on standard error.
     */
    public static void main(String[] args) {
        try {
            ExtractionResult extraction = Kreuzberg.extractFile("document.pdf");

            for (Table tbl : extraction.getTables()) {
                int rowCount = tbl.cells().size();
                System.out.println("Table with " + rowCount + " rows");
                System.out.println(tbl.markdown());

                for (List<String> cells : tbl.cells()) {
                    System.out.println(cells);
                }
            }
        } catch (IOException | KreuzbergException failure) {
            System.err.println("Extraction failed: " + failure.getMessage());
        }
    }
}
Kotlin
import dev.kreuzberg.*
import java.nio.file.Paths

/** Extracts `document.pdf` and prints each table's page, row count, Markdown and rows. */
fun main() {
    val extraction = Kreuzberg.extractFileSync(
        Paths.get("document.pdf"),
        null,
        ExtractionConfig.builder().build()
    )

    // tables() may be null; treat that as an empty list.
    val found = extraction.tables() ?: emptyList()
    found.forEach { tbl ->
        println("Table on page ${tbl.pageNumber()} with ${tbl.cells().size} rows")
        println(tbl.markdown())

        tbl.cells().forEach { cells -> println(cells) }
    }
}
Python
from kreuzberg import extract_file_sync, ExtractionConfig, ExtractedTable

# Extract the document, then print each table's row count, Markdown and rows.
extraction = extract_file_sync("document.pdf", config=ExtractionConfig())

for tbl in extraction.tables:
    rows = tbl.cells
    print(f"Table with {len(rows)} rows")
    print(tbl.markdown)
    for cells in rows:
        print(cells)
Ruby
require 'kreuzberg'

extraction = Kreuzberg.extract_file_sync('document.pdf')

# For every table: row count, Markdown rendering, then the rows themselves.
extraction.tables.each do |tbl|
  rows = tbl['cells']
  puts "Table with #{rows.length} rows"
  puts tbl['markdown']

  rows.each { |cells| puts cells }
end
R
library(kreuzberg)

# Extract the workbook and summarise every table that was found.
extraction <- extract_file_sync("spreadsheet.xlsx")
found <- extraction$tables

cat("Tables extracted:", length(found), "\n\n")

for (idx in seq_along(found)) {
  tbl <- found[[idx]]
  cat(sprintf("Table %d:\n", idx))
  cat("  Rows:", nrow(tbl), "\n")
  cat("  Columns:", ncol(tbl), "\n")
  cat("  Column names:", paste(colnames(tbl), collapse=", "), "\n")
  cat("\n")

  # Show a short preview only for tables that have at least one row.
  if (nrow(tbl) > 0L) {
    cat("  Preview (first 3 rows):\n")
    print(head(tbl, 3L))
    cat("\n")
  }
}
Rust
use kreuzberg::{extract_file_sync, ExtractionConfig};

/// Extracts `document.pdf` and prints each table's row count, Markdown
/// rendering and rows (rows use their `Debug` representation).
fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig::default();
    let extraction = extract_file_sync("document.pdf", None, &config)?;

    for tbl in extraction.tables.iter() {
        let rows = &tbl.cells;
        println!("Table with {} rows", rows.len());
        println!("{}", tbl.markdown);

        rows.iter().for_each(|cells| println!("{cells:?}"));
    }
    Ok(())
}
Swift
import Foundation
import Kreuzberg
import RustBridge

// Build the configuration from an empty JSON object and extract the document.
let extractionConfig = try extractionConfigFromJson("{}")
let extraction = try extractFileSync("document.pdf", nil, extractionConfig)

let found = extraction.tables()
print("Tables: \(found.count)")

for (position, entry) in found.enumerated() {
    print("Table \(position) on page \(entry.page_number())")
    print(entry.markdown().toString())

    // The bounding box is optional; print it only when present.
    if let box = entry.bounding_box() {
        print("  Bounding box: \(box.toString())")
    }
}
Elixir
{:ok, extraction} = Kreuzberg.extract_file("document.pdf")

found = extraction.tables
IO.puts("Total tables found: #{length(found)}")

# Number the tables from 1 and print the row count and Markdown of each.
found
|> Enum.with_index(1)
|> Enum.each(fn {entry, position} ->
  IO.puts("\n--- Table #{position} ---")

  # Fall back to an empty list when "cells" is missing or nil.
  rows = entry["cells"] || []
  IO.puts("Rows: #{length(rows)}")

  IO.puts("Markdown representation:")
  IO.puts(entry["markdown"])
end)
TypeScript
// Node.js binding: the package is published as @kreuzberg/node.
import { extractFileSync } from "@kreuzberg/node";

// Extract the document, then print each table's row count, Markdown and rows.
const result = extractFileSync("document.pdf");

// `tables` and `cells` may be undefined, hence the optional chaining.
result.tables?.forEach((table) => {
  console.log(`Table with ${table.cells?.length ?? 0} rows`);
  console.log(table.markdown);
  table.cells?.forEach((row) => console.log(row.join(" | ")));
});
WASM
// Browser/edge binding: the package is published as @kreuzberg/wasm and is
// initialised with initWasm().
import { extractBytes, initWasm } from "@kreuzberg/wasm";

await initWasm();

const fileInput = document.getElementById("file") as HTMLInputElement;
const file = fileInput.files?.[0];

if (file) {
  // Read the selected file into memory; fall back to the PDF MIME type when
  // the browser does not report one.
  const bytes = new Uint8Array(await file.arrayBuffer());
  const result = await extractBytes(bytes, file.type || "application/pdf", undefined);

  result.tables?.forEach((table) => {
    console.log(`Table with ${table.cells?.length ?? 0} rows`);
    if (table.markdown) {
      console.log(table.markdown);
    }
    table.cells?.forEach((row) => console.log(row.join(" | ")));
  });
}
Zig
const std = @import("std");
const kreuzberg = @import("kreuzberg");

// Extracts "document.pdf" and prints every table found in the JSON result:
// its rows, its Markdown rendering, and its page number.
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa.deinit();
    const allocator = gpa.allocator();

    // The configuration is passed as JSON; an empty object is used here.
    const config_json = "{}";
    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    // NOTE(review): the result buffer is released through the C allocator, not
    // `allocator` — presumably because the binding allocates it that way; confirm.
    defer std.heap.c_allocator.free(result_json);

    // Parse the result into a dynamically typed JSON tree.
    var parsed = try std.json.parseFromSlice(std.json.Value, allocator, result_json, .{});
    defer parsed.deinit();

    // Anything other than a top-level object is ignored without an error.
    const root = parsed.value;
    if (root != .object) return;

    const stdout = std.io.getStdOut().writer();

    // Nothing to print when "tables" is missing or is not an array.
    const tables_val = root.object.get("tables") orelse return;
    if (tables_val != .array) return;

    for (tables_val.array.items) |table| {
        if (table != .object) continue;

        // "cells" is an array of rows, each row an array of cell values.
        if (table.object.get("cells")) |cells_val| {
            if (cells_val == .array) {
                try stdout.print("Table with {d} rows\n", .{cells_val.array.items.len});

                for (cells_val.array.items) |row_val| {
                    if (row_val != .array) continue;
                    try stdout.print("  Row:", .{});
                    // Only string cells are printed; other value types are skipped.
                    for (row_val.array.items) |cell_val| {
                        if (cell_val == .string) {
                            try stdout.print(" [{s}]", .{cell_val.string});
                        }
                    }
                    try stdout.print("\n", .{});
                }
            }
        }

        if (table.object.get("markdown")) |markdown_val| {
            if (markdown_val == .string) {
                try stdout.print("{s}\n", .{markdown_val.string});
            }
        }

        if (table.object.get("page_number")) |page_val| {
            if (page_val == .integer) {
                try stdout.print("Page: {d}\n", .{page_val.integer});
            }
        }
    }
}

Extract and process tables from documents:

Terminal
# Extract with JSON format (includes tables when detected)
kreuzberg extract document.pdf --format json

# Save the JSON output (tables included) to a file
kreuzberg extract spreadsheet.xlsx --format json > tables.json

# Print the Markdown rendering of each table
# (the `[]?` form yields nothing instead of an error when `tables` is null)
kreuzberg extract document.pdf --format json | \
  jq '.tables[]? | .markdown'

# Print the cell rows of each table
kreuzberg extract document.pdf --format json | \
  jq '.tables[]? | .cells'

# Batch extract tables from multiple files
# (`**` recurses into subdirectories only when the shell supports it,
# e.g. zsh, or bash after `shopt -s globstar`)
kreuzberg batch documents/**/*.pdf --format json > all_tables.json

JSON Table Structure:

JSON
{
  "content": "...",
  "tables": [
    {
      "cells": [
        ["Name", "Age", "City"],
        ["Alice", "30", "New York"],
        ["Bob", "25", "Los Angeles"]
      ],
      "markdown": "| Name | Age | City |\n|------|-----|--------|\n| Alice | 30 | New York |\n| Bob | 25 | Los Angeles |"
    }
  ]
}

Going Async

Use async extraction in web servers, background workers, or anywhere you need non-blocking I/O:

C
#include "kreuzberg.h"
#include <stdio.h>
#include <stdlib.h>

/* kreuzberg_extract_file runs the extraction on the global Tokio runtime and
 * only returns when the work has finished.  To avoid blocking the caller,
 * invoke it from a dedicated OS thread and synchronize via a semaphore or
 * callback. */
int main(void) {
    int exit_code = 1;
    KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_default();
    KREUZBERGExtractionResult *result =
        kreuzberg_extract_file("document.pdf", NULL, config);

    if (result) {
        /* The content string is a separate allocation and is freed on its own. */
        char *content = kreuzberg_extraction_result_content(result);
        printf("%s\n", content ? content : "(empty)");
        kreuzberg_free_string(content);

        kreuzberg_extraction_result_free(result);
        exit_code = 0;
    } else {
        fprintf(stderr, "extraction failed (code %d): %s\n",
                kreuzberg_last_error_code(),
                kreuzberg_last_error_context());
    }

    /* The config is released on both the success and the failure path. */
    kreuzberg_extraction_config_free(config);
    return exit_code;
}
C#
using Kreuzberg;

// Await the asynchronous extraction, then print the text and the MIME type.
var extraction = await KreuzbergLib.ExtractFileAsync("document.pdf");

Console.WriteLine(extraction.Content);
Console.WriteLine(extraction.MimeType);
Dart
import 'package:kreuzberg/kreuzberg.dart';

/// Extracts `document.pdf` and prints its text, MIME type and table count.
Future<void> main() async {
  final extraction = await KreuzbergBridge.extractFile('document.pdf', null);
  final tableCount = extraction.tables.length;

  print(extraction.content);
  print('MIME type: ${extraction.mimeType}');
  print('Tables: $tableCount');
}
Go
package main

import (
    "log"

    "github.com/kreuzberg-dev/kreuzberg/v5"
)

// main extracts document.pdf with a zero-value configuration and prints the
// content and MIME type; it exits via log.Fatalf when extraction fails.
func main() {
    cfg := kreuzberg.ExtractionConfig{}
    extraction, err := kreuzberg.ExtractFile("document.pdf", nil, cfg)
    if err != nil {
        log.Fatalf("extraction failed: %v", err)
    }

    println("Content:", extraction.Content)
    println("MIME type:", extraction.MimeType)
}
Java
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import java.nio.file.Paths;

// Build a configuration without overrides, extract the file, then print the
// extracted text followed by the MIME type.
ExtractionConfig settings = ExtractionConfig.builder().build();
ExtractionResult extraction = Kreuzberg.extractFile(Paths.get("document.pdf"), settings);

System.out.println(extraction.content());
System.out.println(extraction.mimeType());
Kotlin
import dev.kreuzberg.*
import dev.kreuzberg.kt.Kreuzberg
import kotlinx.coroutines.runBlocking
import java.nio.file.Paths

/** Runs the extraction inside `runBlocking` and prints a short summary. */
fun main() = runBlocking {
    val settings = ExtractionConfig.builder().build()
    val extraction = Kreuzberg.extractFile(Paths.get("document.pdf"), null, settings)

    println(extraction.content())
    println("MIME type: ${extraction.mimeType()}")

    // tables() may be null; report zero in that case.
    val tableCount = extraction.tables()?.size ?: 0
    println("Tables: $tableCount")
}
Python
import asyncio
from kreuzberg import extract_file, ExtractionConfig


async def main() -> None:
    """Extract ``document.pdf`` and print a 200-character preview plus a summary."""
    extraction = await extract_file("document.pdf", config=ExtractionConfig())
    preview = extraction.content[:200]
    print(preview)
    print(f"Tables: {len(extraction.tables)}")
    print(f"Format: {extraction.metadata.format_type}")


asyncio.run(main())
Ruby
require 'kreuzberg'

# Caching disabled, quality processing enabled.
settings = Kreuzberg::ExtractionConfig.new(
  enable_quality_processing: true,
  use_cache: false
)

extraction = Kreuzberg.extract_file_async('document.pdf', config: settings)

puts "Async extraction complete"
puts "Extracted #{extraction.content.length} characters"
puts "Quality: #{extraction.quality_score}"
R
library(kreuzberg)

# extract_file is the async variant. extendr drives the tokio runtime, so the
# call only returns after extraction has finished. R has no native async; use
# the future/promises packages when non-blocking dispatch is needed.
raw_json <- extract_file(
  path = "document.pdf",
  mime_type = "application/pdf",
  config = ExtractionConfig$default()
)

# The result arrives as a JSON string; parse it into nested lists.
parsed <- jsonlite::fromJSON(raw_json, simplifyVector = FALSE)

cat(sprintf("Extracted %d characters from %s\n", nchar(parsed$content), parsed$mime_type))
Rust
use kreuzberg::{extract_file, ExtractionConfig};

/// Extracts `document.pdf` asynchronously and prints its text, MIME type and
/// table count.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let settings = ExtractionConfig::default();
    let extraction = extract_file("document.pdf", None::<&str>, &settings).await?;
    let table_count = extraction.tables.len();

    println!("{}", extraction.content);
    println!("MIME type: {}", extraction.mime_type);
    println!("Tables: {table_count}");
    Ok(())
}
Swift
import Foundation
import Kreuzberg
import RustBridge

@main
struct App {
    /// Extracts `document.pdf` and prints its text, MIME type and table count.
    static func main() async throws {
        let extractionConfig = try extractionConfigFromJson("{}")
        // The bridge calls are synchronous internally, but the Swift binding
        // exposes async-compatible entrypoints so callers can `await` them
        // and integrate with Swift Concurrency.
        let extraction = try await extractFile("document.pdf", nil, extractionConfig)

        let text = extraction.content().toString()
        let mime = extraction.mime_type().toString()
        let tableCount = extraction.tables().count

        print(text)
        print("MIME type: \(mime)")
        print("Tables: \(tableCount)")
    }
}
Elixir
# Start the extraction as a task and wait for its result.
task = Kreuzberg.extract_file_async("document.pdf")
{:ok, result} = Task.await(task)

content = result.content
table_count = length(result.tables)
metadata = result.metadata

# String.length/1 counts characters (graphemes). byte_size/1 counts bytes and
# would over-report the length for any non-ASCII text.
IO.puts("Content length: #{String.length(content)} characters")
IO.puts("Tables: #{table_count}")
IO.puts("Metadata keys: #{inspect(Map.keys(metadata))}")
TypeScript
import { extractFile } from "@kreuzberg/node";

// Await the asynchronous extraction, then print the extracted text.
const extraction = await extractFile("document.pdf");
const text = extraction.content;
console.log(text);
WASM
import { extractFromFile, initWasm } from "@kreuzberg/wasm";

await initWasm();

// Take the first file chosen in the <input id="file"> element, if any.
const picker = document.getElementById("file") as HTMLInputElement;
const selected = picker.files?.[0];

if (selected) {
  const extraction = await extractFromFile(selected);
  const text = extraction.content;
  const tableTotal = extraction.tables.length;

  console.log(`Content length: ${text.length} characters`);
  console.log(`Tables: ${tableTotal}`);
}
Zig
const std = @import("std");
const kreuzberg = @import("kreuzberg");

// Note: the Zig binding is sync-only. There is no `extract_file` async variant —
// the FFI surface exposes blocking entry points that internally drive the global
// Tokio runtime. Use `extract_file_sync` from any thread.
//
// This example extracts "document.pdf" and writes the raw JSON result to stdout.
pub fn main() !void {
    // The configuration is passed as JSON; an empty object is used here.
    const config_json = "{}";
    const result_json = try kreuzberg.extract_file_sync("document.pdf", null, config_json);
    // NOTE(review): the result buffer is released through the C allocator —
    // presumably because the binding allocates it that way; confirm.
    defer std.heap.c_allocator.free(result_json);

    const stdout = std.io.getStdOut().writer();
    try stdout.print("{s}\n", .{result_json});
}

Not Applicable

Async extraction is an API-level feature; the CLI always runs synchronously. For async operations, use one of the language bindings (for example Python, TypeScript, Rust, or WASM).

Next Steps

You've covered the core API. Go deeper:

Edit this page on GitHub