Skip to content

Extraction Basics

Eight core extraction functions are available, organized by input type (file path vs bytes), cardinality (single vs batch), and execution model (sync vs async).

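| Input     | Single sync          | Single async    | Batch sync                 | Batch async           |
|-----------|----------------------|-----------------|----------------------------|-----------------------|
| File path | `extract_file_sync`  | `extract_file`  | `batch_extract_files_sync` | `batch_extract_files` |
| Bytes     | `extract_bytes_sync` | `extract_bytes` | `batch_extract_bytes_sync` | `batch_extract_bytes` |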

!!! tip "Sync vs Async"
    Use the async variants when you are already in an async context or are processing multiple files concurrently. For scripts and simple pipelines, the sync variants are simpler and just as fast for single files.

Extract from Files

Pass a file path. Kreuzberg detects the MIME type from the extension and selects the right parser automatically.

Synchronous

Python
from kreuzberg import extract_file_sync, ExtractionConfig

result = extract_file_sync("document.pdf", config=ExtractionConfig())

print(result.content[:200])
print(f"Tables: {len(result.tables)}")
print(f"Format: {result.metadata.format_type}")
TypeScript
import { extractFileSync } from "@kreuzberg/node";

const result = extractFileSync("document.pdf");

console.log(result.content);
console.log(`Tables: ${result.tables.length}`);
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
Rust
use kreuzberg::{extract_file_sync, ExtractionConfig};

/// Extracts `document.pdf` synchronously and prints its text, MIME type, and table count.
fn main() -> kreuzberg::Result<()> {
    let settings = ExtractionConfig::default();
    // No MIME type is passed (`None`); it is detected from the file extension.
    let extracted = extract_file_sync("document.pdf", None, &settings)?;

    println!("{}", extracted.content);
    println!("MIME type: {}", extracted.mime_type);
    println!("Tables: {}", extracted.tables.len());
    Ok(())
}
Go
package main

import (
    "log"

    "github.com/kreuzberg-dev/kreuzberg/v5"
)

// main extracts document.pdf synchronously with a zero-value config and
// prints the extracted content; any extraction error aborts the program.
func main() {
    cfg := kreuzberg.ExtractionConfig{}

    extracted, err := kreuzberg.ExtractFileSync("document.pdf", nil, cfg)
    if err != nil {
        log.Fatalf("extraction failed: %v", err)
    }

    println("Content:", extracted.Content)
}
Java
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import java.nio.file.Paths;

ExtractionConfig config = ExtractionConfig.builder().build();
ExtractionResult result = Kreuzberg.extractFileSync(Paths.get("document.pdf"), config);

System.out.println(result.content());
System.out.println("Tables: " + (result.tables() != null ? result.tables().size() : 0));
System.out.println("Metadata: " + result.metadata());
C#
using Kreuzberg;

var result = KreuzbergLib.ExtractFileSync("document.pdf", new ExtractionConfig());

Console.WriteLine(result.Content);
Console.WriteLine($"Tables: {result.Tables.Count}");
Console.WriteLine($"Metadata: {result.Metadata.FormatType}");
Ruby
require 'kreuzberg'

config = Kreuzberg::ExtractionConfig.new(
  use_cache: true,
  enable_quality_processing: true
)

result = Kreuzberg.extract_file_sync('document.pdf', config: config)

puts "Extracted #{result.content.length} characters"
puts "MIME type: #{result.mime_type}"
puts "Quality score: #{result.quality_score}"
R
library(kreuzberg)

json <- extract_file_sync(
  path = "document.pdf",
  mime_type = "application/pdf",
  config = ExtractionConfig$default()
)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

cat(sprintf("MIME type: %s\n", result$mime_type))
cat(sprintf("Content length: %d characters\n", nchar(result$content)))
C
#include "kreuzberg.h"
#include <stdio.h>
#include <stdlib.h>

/* Extract document.pdf synchronously and print its text content to stdout.
 * Returns 0 on success and 1 when extraction fails. */
int main(void) {
    KREUZBERGExtractionConfig *cfg = kreuzberg_extraction_config_default();
    KREUZBERGExtractionResult *extracted =
        kreuzberg_extract_file_sync("document.pdf", NULL, cfg);

    if (extracted == NULL) {
        /* Report the library's last error, then release the config before exiting. */
        fprintf(stderr, "extraction failed (code %d): %s\n",
                kreuzberg_last_error_code(),
                kreuzberg_last_error_context());
        kreuzberg_extraction_config_free(cfg);
        return 1;
    }

    /* The content string is released with kreuzberg_free_string once printed. */
    char *text = kreuzberg_extraction_result_content(extracted);
    printf("%s\n", text != NULL ? text : "(empty)");
    kreuzberg_free_string(text);

    kreuzberg_extraction_result_free(extracted);
    kreuzberg_extraction_config_free(cfg);
    return 0;
}
WASM
// WASM exposes only async extraction. Read the file as bytes and call extractBytes.
import init, { extractBytes } from "kreuzberg-wasm";

await init();

const fileInput = document.getElementById("file") as HTMLInputElement;
const file = fileInput.files?.[0];
if (file) {
  const bytes = new Uint8Array(await file.arrayBuffer());
  const result = await extractBytes(bytes, file.type || "application/pdf", undefined);
  console.log(result.content);
  console.log(`Tables: ${result.tables?.length ?? 0}`);
}

Asynchronous

Python
import asyncio
from kreuzberg import extract_file, ExtractionConfig

async def main() -> None:
    """Extract ``document.pdf`` asynchronously and print a short summary."""
    extracted = await extract_file("document.pdf", config=ExtractionConfig())
    # Only the first 200 characters of the content are shown as a preview.
    preview = extracted.content[:200]
    print(preview)
    print(f"Tables: {len(extracted.tables)}")
    print(f"Format: {extracted.metadata.format_type}")

asyncio.run(main())
TypeScript
import { extractFile } from "@kreuzberg/node";

const result = await extractFile("document.pdf");
console.log(result.content);
Rust
use kreuzberg::{extract_file, ExtractionConfig};

/// Extracts `document.pdf` asynchronously under the Tokio runtime and prints a short summary.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let settings = ExtractionConfig::default();
    // No MIME type is supplied, so it is detected from the file extension.
    let extracted = extract_file("document.pdf", None::<&str>, &settings).await?;

    println!("{}", extracted.content);
    println!("MIME type: {}", extracted.mime_type);
    println!("Tables: {}", extracted.tables.len());
    Ok(())
}
Go
package main

import (
    "log"

    "github.com/kreuzberg-dev/kreuzberg/v5"
)

// main extracts document.pdf through the async-variant entry point and
// prints the content and MIME type; any extraction error aborts the program.
func main() {
    cfg := kreuzberg.ExtractionConfig{}

    extracted, err := kreuzberg.ExtractFile("document.pdf", nil, cfg)
    if err != nil {
        log.Fatalf("extraction failed: %v", err)
    }

    println("Content:", extracted.Content)
    println("MIME type:", extracted.MimeType)
}
Java
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import java.nio.file.Paths;

ExtractionConfig config = ExtractionConfig.builder().build();
ExtractionResult result = Kreuzberg.extractFile(Paths.get("document.pdf"), config);

System.out.println(result.content());
System.out.println(result.mimeType());
C#
using Kreuzberg;

var result = await KreuzbergLib.ExtractFileAsync("document.pdf");

Console.WriteLine(result.Content);
Console.WriteLine(result.MimeType);
Ruby
require 'kreuzberg'

config = Kreuzberg::ExtractionConfig.new(
  use_cache: false,
  enable_quality_processing: true
)

result = Kreuzberg.extract_file_async('document.pdf', config: config)

puts "Async extraction complete"
puts "Extracted #{result.content.length} characters"
puts "Quality: #{result.quality_score}"
R
library(kreuzberg)

# extract_file is the async variant; extendr drives the tokio runtime so the
# call returns once extraction completes. R has no native async, so wrap with
# the future/promises packages if non-blocking dispatch is required.
json <- extract_file(
  path = "document.pdf",
  mime_type = "application/pdf",
  config = ExtractionConfig$default()
)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

cat(sprintf("Extracted %d characters from %s\n", nchar(result$content), result$mime_type))
C
#include "kreuzberg.h"
#include <stdio.h>
#include <stdlib.h>

/* kreuzberg_extract_file schedules work on the global Tokio runtime and
 * returns once extraction is complete.  For true non-blocking use, call it
 * from a dedicated OS thread and synchronize via a semaphore or callback. */
/* Extract document.pdf with kreuzberg_extract_file and print its text content.
 * Returns 0 on success and 1 when extraction fails. */
int main(void) {
    KREUZBERGExtractionConfig *cfg = kreuzberg_extraction_config_default();
    KREUZBERGExtractionResult *extracted =
        kreuzberg_extract_file("document.pdf", NULL, cfg);

    if (extracted == NULL) {
        /* Report the library's last error, then release the config before exiting. */
        fprintf(stderr, "extraction failed (code %d): %s\n",
                kreuzberg_last_error_code(),
                kreuzberg_last_error_context());
        kreuzberg_extraction_config_free(cfg);
        return 1;
    }

    /* The content string is released with kreuzberg_free_string once printed. */
    char *text = kreuzberg_extraction_result_content(extracted);
    printf("%s\n", text != NULL ? text : "(empty)");
    kreuzberg_free_string(text);

    kreuzberg_extraction_result_free(extracted);
    kreuzberg_extraction_config_free(cfg);
    return 0;
}
WASM
import init, { extractBytes } from "kreuzberg-wasm";

await init();

const fileInput = document.getElementById("file") as HTMLInputElement;
const file = fileInput.files?.[0];
if (file) {
  const bytes = new Uint8Array(await file.arrayBuffer());
  const result = await extractBytes(bytes, file.type || "application/pdf", undefined);
  console.log(`Content length: ${result.content.length} characters`);
  console.log(`Tables: ${result.tables?.length ?? 0}`);
}

Extract from Bytes

When the file is already loaded in memory (for example, from an upload or network response), pass the byte array with its MIME type. Unlike file extraction, the MIME type is required since there's no file extension to infer it from.

Synchronous

Python
from kreuzberg import extract_bytes_sync, ExtractionConfig

with open("document.pdf", "rb") as f:
    content = f.read()

result = extract_bytes_sync(content, "application/pdf", config=ExtractionConfig())

print(result.content[:200])
print(f"Tables: {len(result.tables)}")
TypeScript
import { extractBytesSync } from "@kreuzberg/node";
import { readFileSync } from "fs";

const data = readFileSync("document.pdf");
const result = extractBytesSync(data, "application/pdf");
console.log(result.content);
Rust
use kreuzberg::{extract_bytes_sync, ExtractionConfig};

/// Reads `document.pdf` into memory and extracts it synchronously from the raw bytes.
fn main() -> kreuzberg::Result<()> {
    let pdf_bytes = std::fs::read("document.pdf")?;
    let settings = ExtractionConfig::default();
    // Byte input has no file extension, so the MIME type is passed explicitly.
    let extracted = extract_bytes_sync(&pdf_bytes, "application/pdf", &settings)?;

    println!("{}", extracted.content);
    println!("Tables: {}", extracted.tables.len());
    Ok(())
}
Go
package main

import (
    "log"
    "os"

    "github.com/kreuzberg-dev/kreuzberg/v5"
)

// main loads document.pdf into memory and extracts it synchronously from the
// bytes, passing the MIME type explicitly. Read or extraction errors abort.
func main() {
    raw, err := os.ReadFile("document.pdf")
    if err != nil {
        log.Fatalf("failed to read file: %v", err)
    }

    cfg := kreuzberg.ExtractionConfig{}
    extracted, err := kreuzberg.ExtractBytesSync(raw, "application/pdf", cfg)
    if err != nil {
        log.Fatalf("extraction failed: %v", err)
    }

    println("Content:", extracted.Content)
}
Java
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import java.nio.file.Files;
import java.nio.file.Paths;

byte[] data = Files.readAllBytes(Paths.get("document.pdf"));
ExtractionConfig config = ExtractionConfig.builder().build();
ExtractionResult result = Kreuzberg.extractBytesSync(data, "application/pdf", config);

System.out.println(result.content());
System.out.println(result.mimeType());
C#
using Kreuzberg;

// Synchronous example end to end: read the file with the blocking API so the
// snippet needs no async context.
var data = File.ReadAllBytes("document.pdf");
var result = KreuzbergLib.ExtractBytesSync(data, "application/pdf");

Console.WriteLine(result.Content);
Console.WriteLine(result.MimeType);
Ruby
require 'kreuzberg'

# Read the PDF in binary mode: File.read opens in text mode, which can apply
# newline/encoding translation and corrupt binary data on some platforms.
pdf_bytes = File.binread('document.pdf')
config = Kreuzberg::ExtractionConfig.new(
  use_cache: true
)

# Byte input has no file extension, so the MIME type is passed explicitly.
result = Kreuzberg.extract_bytes_sync(
  pdf_bytes,
  'application/pdf',
  config: config
)

puts "Extracted #{result.content.length} characters"
puts "Detected MIME: #{result.mime_type}"
R
library(kreuzberg)

path <- "document.pdf"
content <- readBin(path, what = "raw", n = file.info(path)$size)

json <- extract_bytes_sync(
  content = content,
  mime_type = "application/pdf",
  config = ExtractionConfig$default()
)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

cat(sprintf("MIME type: %s\n", result$mime_type))
cat(sprintf("Content preview: %s\n", substr(result$content, 1, 200)))
C
#include "kreuzberg.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Extract a short in-memory plain-text buffer synchronously and print the
 * resulting content. Returns 0 on success and 1 when extraction fails. */
int main(void) {
    const char *sample = "Hello, kreuzberg!";
    size_t sample_len = strlen(sample);
    /* The API takes raw bytes, so view the C string as a byte buffer. */
    const uint8_t *payload = (const uint8_t *)sample;

    KREUZBERGExtractionConfig *cfg = kreuzberg_extraction_config_default();
    KREUZBERGExtractionResult *extracted =
        kreuzberg_extract_bytes_sync(payload, sample_len, "text/plain", cfg);

    if (extracted == NULL) {
        /* Report the library's last error, then release the config before exiting. */
        fprintf(stderr, "extraction failed (code %d): %s\n",
                kreuzberg_last_error_code(),
                kreuzberg_last_error_context());
        kreuzberg_extraction_config_free(cfg);
        return 1;
    }

    /* The content string is released with kreuzberg_free_string once printed. */
    char *out = kreuzberg_extraction_result_content(extracted);
    printf("%s\n", out != NULL ? out : "(empty)");
    kreuzberg_free_string(out);

    kreuzberg_extraction_result_free(extracted);
    kreuzberg_extraction_config_free(cfg);
    return 0;
}
WASM
// WASM exposes only async extractBytes; await it from any async context.
import init, { extractBytes } from "kreuzberg-wasm";

await init();

const response = await fetch("document.pdf");
const data = new Uint8Array(await response.arrayBuffer());

const result = await extractBytes(data, "application/pdf", undefined);
console.log(result.content);

Asynchronous

Python
import asyncio
from kreuzberg import extract_bytes, ExtractionConfig

async def main() -> None:
    """Read ``document.pdf`` into memory and extract it asynchronously from the bytes."""
    with open("document.pdf", "rb") as handle:
        payload = handle.read()

    # Byte input has no file extension, so the MIME type is passed explicitly.
    extracted = await extract_bytes(payload, "application/pdf", config=ExtractionConfig())
    print(extracted.content[:200])
    print(f"Tables: {len(extracted.tables)}")

asyncio.run(main())
TypeScript
import { extractBytes } from "@kreuzberg/node";
import { readFile } from "fs/promises";

const data = await readFile("document.pdf");
const result = await extractBytes(data, "application/pdf");
console.log(result.content);
Rust
use kreuzberg::{extract_bytes, ExtractionConfig};

/// Reads `document.pdf` with Tokio's async file API and extracts it from the raw bytes.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let pdf_bytes = tokio::fs::read("document.pdf").await?;
    let settings = ExtractionConfig::default();
    // Byte input has no file extension, so the MIME type is passed explicitly.
    let extracted = extract_bytes(&pdf_bytes, "application/pdf", &settings).await?;

    println!("{}", extracted.content);
    println!("Tables: {}", extracted.tables.len());
    Ok(())
}
Go
package main

import (
    "log"
    "os"

    "github.com/kreuzberg-dev/kreuzberg/v5"
)

// main loads document.pdf into memory and extracts it from the bytes through
// the async-variant entry point. Read or extraction errors abort the program.
func main() {
    raw, err := os.ReadFile("document.pdf")
    if err != nil {
        log.Fatalf("failed to read file: %v", err)
    }

    cfg := kreuzberg.ExtractionConfig{}
    extracted, err := kreuzberg.ExtractBytes(raw, "application/pdf", cfg)
    if err != nil {
        log.Fatalf("extraction failed: %v", err)
    }

    println("Content:", extracted.Content)
}
Java
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import java.nio.file.Files;
import java.nio.file.Paths;

byte[] data = Files.readAllBytes(Paths.get("document.pdf"));
ExtractionConfig config = ExtractionConfig.builder().build();
ExtractionResult result = Kreuzberg.extractBytes(data, "application/pdf", config);

System.out.println(result.content());
System.out.println(result.mimeType());
C#
using Kreuzberg;

var data = await File.ReadAllBytesAsync("document.pdf");
var result = await KreuzbergLib.ExtractBytesAsync(data, "application/pdf");

Console.WriteLine(result.Content);
Console.WriteLine(result.MimeType);
Ruby
require 'kreuzberg'

# Read the PDF in binary mode: File.read opens in text mode, which can apply
# newline/encoding translation and corrupt binary data on some platforms.
pdf_bytes = File.binread('document.pdf')
config = Kreuzberg::ExtractionConfig.new(
  enable_quality_processing: true
)

# Byte input has no file extension, so the MIME type is passed explicitly.
result = Kreuzberg.extract_bytes_async(
  pdf_bytes,
  'application/pdf',
  config: config
)

puts "Async bytes extraction done"
puts "Content preview: #{result.content[0..100]}"
puts "Quality score: #{result.quality_score}"
R
library(kreuzberg)

# extract_bytes is the async variant; the call blocks the calling R thread
# until the underlying tokio task completes. Use future/promises if you need
# to fan out without blocking.
path <- "document.pdf"
content <- readBin(path, what = "raw", n = file.info(path)$size)

json <- extract_bytes(
  content = content,
  mime_type = "application/pdf",
  config = ExtractionConfig$default()
)
result <- jsonlite::fromJSON(json, simplifyVector = FALSE)

cat(sprintf("Extracted %d characters\n", nchar(result$content)))
C
#include "kreuzberg.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* kreuzberg_extract_bytes schedules work on the global Tokio runtime and
 * returns once extraction is complete.  For true non-blocking use, call it
 * from a dedicated OS thread and synchronize via a semaphore or callback. */
/* Extract a short in-memory plain-text buffer with kreuzberg_extract_bytes and
 * print the resulting content. Returns 0 on success and 1 on failure. */
int main(void) {
    const char *sample = "Hello, kreuzberg!";
    size_t sample_len = strlen(sample);
    /* The API takes raw bytes, so view the C string as a byte buffer. */
    const uint8_t *payload = (const uint8_t *)sample;

    KREUZBERGExtractionConfig *cfg = kreuzberg_extraction_config_default();
    KREUZBERGExtractionResult *extracted =
        kreuzberg_extract_bytes(payload, sample_len, "text/plain", cfg);

    if (extracted == NULL) {
        /* Report the library's last error, then release the config before exiting. */
        fprintf(stderr, "extraction failed (code %d): %s\n",
                kreuzberg_last_error_code(),
                kreuzberg_last_error_context());
        kreuzberg_extraction_config_free(cfg);
        return 1;
    }

    /* The content string is released with kreuzberg_free_string once printed. */
    char *out = kreuzberg_extraction_result_content(extracted);
    printf("%s\n", out != NULL ? out : "(empty)");
    kreuzberg_free_string(out);

    kreuzberg_extraction_result_free(extracted);
    kreuzberg_extraction_config_free(cfg);
    return 0;
}
WASM
import init, { extractBytes } from "kreuzberg-wasm";

await init();

const response = await fetch("document.pdf");
const data = new Uint8Array(await response.arrayBuffer());

const result = await extractBytes(data, "application/pdf", undefined);
console.log(`Extracted: ${result.content.length} characters`);
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);

Batch Processing

Batch functions accept an array of file paths (or byte arrays) and process them concurrently. This is typically 2-5x faster than looping over single-file functions because Kreuzberg parallelizes internally.

Batch Extract Files

Python
from kreuzberg import batch_extract_files_sync, BatchFileItem, ExtractionConfig

items = [
    BatchFileItem(path="doc1.pdf"),
    BatchFileItem(path="doc2.docx"),
    BatchFileItem(path="doc3.html"),
]

results = batch_extract_files_sync(items, ExtractionConfig())

for i, result in enumerate(results):
    print(f"Document {i}: {len(result.content)} chars, {len(result.tables)} tables")
TypeScript
import { batchExtractFilesSync } from "@kreuzberg/node";

const files = ["doc1.pdf", "doc2.docx", "doc3.pptx"];
const results = batchExtractFilesSync(files);

results.forEach((result, i) => {
  console.log(`File ${i + 1}: ${result.content.length} characters`);
});
Rust
use kreuzberg::{batch_extract_files_sync, BatchFileItem, ExtractionConfig};

/// Extracts three documents in a single synchronous batch call and prints each result's length.
fn main() -> kreuzberg::Result<()> {
    let settings = ExtractionConfig::default();
    // `config: None` leaves every item without a per-file override.
    let batch = vec![
        BatchFileItem { path: "doc1.pdf".into(), config: None },
        BatchFileItem { path: "doc2.docx".into(), config: None },
        BatchFileItem { path: "report.pdf".into(), config: None },
    ];
    let extracted = batch_extract_files_sync(batch, &settings)?;

    extracted
        .iter()
        .enumerate()
        .for_each(|(idx, doc)| println!("File {}: {} chars", idx, doc.content.len()));
    Ok(())
}
Go
package main

import (
    "log"

    "github.com/kreuzberg-dev/kreuzberg/v5"
)

// main extracts three files in one synchronous batch call and prints the
// content length of each result; a batch error aborts the program.
func main() {
    batch := []kreuzberg.BatchFileItem{
        {Path: "doc1.pdf"},
        {Path: "doc2.docx"},
        {Path: "doc3.pptx"},
    }
    cfg := kreuzberg.ExtractionConfig{}

    extracted, err := kreuzberg.BatchExtractFilesSync(batch, cfg)
    if err != nil {
        log.Fatalf("batch extraction failed: %v", err)
    }

    for idx := range extracted {
        println("Doc", idx, "content length:", len(extracted[idx].Content))
    }
}
Java
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.BatchFileItem;
import dev.kreuzberg.ExtractionConfig;
import java.nio.file.Paths;
import java.util.List;
import java.util.Arrays;

List<BatchFileItem> items = Arrays.asList(
    new BatchFileItem(Paths.get("doc1.pdf"), null),
    new BatchFileItem(Paths.get("doc2.docx"), null),
    new BatchFileItem(Paths.get("doc3.pptx"), null)
);

ExtractionConfig config = ExtractionConfig.builder().build();
List<ExtractionResult> results = Kreuzberg.batchExtractFilesSync(items, config);

for (ExtractionResult result : results) {
    System.out.println("Content length: " + result.content().length());
}
C#
using Kreuzberg;

var files = new[] { "doc1.pdf", "doc2.docx", "doc3.pptx" };
var results = KreuzbergLib.BatchExtractFilesSync(files, new ExtractionConfig());

foreach (var result in results)
{
    Console.WriteLine($"Content length: {result.Content.Length}");
}
Ruby
require 'kreuzberg'

items = [
  Kreuzberg::BatchFileItem.new(path: 'doc1.pdf'),
  Kreuzberg::BatchFileItem.new(path: 'doc2.docx'),
  Kreuzberg::BatchFileItem.new(path: 'doc3.pptx')
]

config = Kreuzberg::ExtractionConfig.new(use_cache: true)

results = Kreuzberg.batch_extract_files_sync(items, config: config)

results.each_with_index do |result, idx|
  puts "Document #{idx + 1}:"
  puts "  Extracted: #{result.content.length} characters"
  puts "  Quality: #{result.quality_score}"
  puts "  MIME: #{result.mime_type}"
end
R
library(kreuzberg)

items <- jsonlite::toJSON(list(
  list(path = "report.pdf"),
  list(path = "slides.pptx"),
  list(path = "data.xlsx")
), auto_unbox = TRUE)

json <- batch_extract_files_sync(items = items, config = ExtractionConfig$default())
results <- jsonlite::fromJSON(json, simplifyVector = FALSE)

for (i in seq_along(results)) {
  cat(sprintf("[%d] mime=%s chars=%d\n",
              i, results[[i]]$mime_type, nchar(results[[i]]$content)))
}
C
#include "kreuzberg.h"
#include <stdio.h>
#include <stdlib.h>

/* Run a synchronous batch extraction over three files described as JSON and
 * print the JSON results. Returns 0 on success and 1 on failure. */
int main(void) {
    /* The batch is described as a JSON array of BatchFileItem objects: each
     * entry carries a "path" and may carry a per-file "config" override. */
    const char *batch_json =
        "["
        "  {\"path\": \"doc1.pdf\"},"
        "  {\"path\": \"doc2.docx\"},"
        "  {\"path\": \"scan.png\", \"config\": {\"force_ocr\": true}}"
        "]";

    KREUZBERGExtractionConfig *cfg = kreuzberg_extraction_config_default();

    /* On success this is a JSON array of ExtractionResult objects; NULL means failure. */
    char *output_json = kreuzberg_batch_extract_files_sync(batch_json, cfg);
    if (output_json == NULL) {
        fprintf(stderr, "batch extraction failed (code %d): %s\n",
                kreuzberg_last_error_code(),
                kreuzberg_last_error_context());
        kreuzberg_extraction_config_free(cfg);
        return 1;
    }

    printf("%s\n", output_json);
    kreuzberg_free_string(output_json);
    kreuzberg_extraction_config_free(cfg);
    return 0;
}
WASM
// WASM has no batch helper; await extractBytes for each file (in parallel via Promise.all).
import init, { extractBytes } from "kreuzberg-wasm";

await init();

const input = document.getElementById("files") as HTMLInputElement;
const files = Array.from(input.files ?? []);

const results = await Promise.all(
  files.map(async (file) => {
    const bytes = new Uint8Array(await file.arrayBuffer());
    return extractBytes(bytes, file.type || "application/pdf", undefined);
  }),
);

results.forEach((result, i) => {
  console.log(`File ${i + 1}: ${result.content.length} characters`);
});

Batch Extract Bytes

Python
from kreuzberg import batch_extract_bytes_sync, BatchBytesItem, ExtractionConfig

items = [
    BatchBytesItem(content=b"PDF content", mime_type="application/pdf"),
    BatchBytesItem(content=b"<html>...</html>", mime_type="text/html"),
]

results = batch_extract_bytes_sync(items, ExtractionConfig())

for i, result in enumerate(results):
    print(f"Item {i}: {len(result.content)} chars extracted")
TypeScript
import { batchExtractBytesSync } from "@kreuzberg/node";
import { readFileSync } from "fs";

const files = ["doc1.pdf", "doc2.docx"];
const dataList = files.map((f) => readFileSync(f));
const mimeTypes = [
  "application/pdf",
  "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
];

const results = batchExtractBytesSync(dataList, mimeTypes);

results.forEach((result, i) => {
  console.log(`Document ${i + 1}: ${result.content.length} characters`);
});
Rust
use kreuzberg::{batch_extract_bytes_sync, BatchBytesItem, ExtractionConfig};

/// Extracts two in-memory documents in one synchronous batch call and prints each result's length.
fn main() -> kreuzberg::Result<()> {
    let settings = ExtractionConfig::default();

    // Each item pairs raw bytes with its MIME type; `config: None` adds no per-item override.
    let plain_text = BatchBytesItem {
        content: b"Hello, world!".to_vec(),
        mime_type: "text/plain".to_string(),
        config: None,
    };
    let markdown = BatchBytesItem {
        content: b"# Heading\n\nParagraph text.".to_vec(),
        mime_type: "text/markdown".to_string(),
        config: None,
    };

    let extracted = batch_extract_bytes_sync(vec![plain_text, markdown], &settings)?;

    extracted
        .iter()
        .enumerate()
        .for_each(|(idx, doc)| println!("Item {}: {} chars", idx, doc.content.len()));
    Ok(())
}
Go
package main

import (
    "log"
    "os"

    "github.com/kreuzberg-dev/kreuzberg/v5"
)

// main reads two documents into memory, extracts them in one synchronous
// batch call, and prints how many results came back. Any read or extraction
// error aborts the program instead of being silently ignored.
func main() {
    // Check each read: a discarded error would pass nil bytes to the extractor.
    doc1, err := os.ReadFile("doc1.pdf")
    if err != nil {
        log.Fatalf("failed to read doc1.pdf: %v", err)
    }
    doc2, err := os.ReadFile("doc2.docx")
    if err != nil {
        log.Fatalf("failed to read doc2.docx: %v", err)
    }

    items := []kreuzberg.BatchBytesItem{
        {Content: doc1, MimeType: "application/pdf"},
        {Content: doc2, MimeType: "application/vnd.openxmlformats-officedocument.wordprocessingml.document"},
    }

    results, err := kreuzberg.BatchExtractBytesSync(items, kreuzberg.ExtractionConfig{})
    if err != nil {
        log.Fatalf("batch extraction failed: %v", err)
    }

    println("Processed", len(results), "documents")
}
Java
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.BatchBytesItem;
import dev.kreuzberg.ExtractionConfig;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
import java.util.Arrays;

byte[] doc1 = Files.readAllBytes(Paths.get("doc1.pdf"));
byte[] doc2 = Files.readAllBytes(Paths.get("doc2.docx"));

List<BatchBytesItem> items = Arrays.asList(
    new BatchBytesItem(doc1, "application/pdf", null),
    new BatchBytesItem(doc2, "application/vnd.openxmlformats-officedocument.wordprocessingml.document", null)
);

ExtractionConfig config = ExtractionConfig.builder().build();
List<ExtractionResult> results = Kreuzberg.batchExtractBytesSync(items, config);
System.out.println("Processed " + results.size() + " documents");
C#
using Kreuzberg;

// Synchronous example end to end: read each file with the blocking API so the
// snippet needs no async context.
var documents = new[]
{
    new BytesWithMime(File.ReadAllBytes("doc1.pdf"), "application/pdf"),
    new BytesWithMime(File.ReadAllBytes("doc2.docx"), "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
};

var results = KreuzbergLib.BatchExtractBytesSync(documents, new ExtractionConfig());

Console.WriteLine($"Processed {results.Count} documents");
Ruby
require 'kreuzberg'

# Read every document in binary mode: File.read opens in text mode, which can
# apply newline/encoding translation and corrupt binary data on some platforms.
items = [
  Kreuzberg::BatchBytesItem.new(
    content: File.binread('doc1.pdf'),
    mime_type: 'application/pdf'
  ),
  Kreuzberg::BatchBytesItem.new(
    content: File.binread('doc2.docx'),
    mime_type: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
  ),
  Kreuzberg::BatchBytesItem.new(
    content: File.binread('doc3.xlsx'),
    mime_type: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
  )
]

config = Kreuzberg::ExtractionConfig.new(use_cache: true)

results = Kreuzberg.batch_extract_bytes_sync(items, config: config)

results.each { |result| puts "Extracted: #{result.content.length} chars" }
R
library(kreuzberg)

paths <- c("report.pdf", "notes.txt")
mimes <- c("application/pdf", "text/plain")

items <- jsonlite::toJSON(lapply(seq_along(paths), function(i) {
  bytes <- readBin(paths[i], what = "raw", n = file.info(paths[i])$size)
  list(content = as.integer(bytes), mime_type = mimes[i])
}), auto_unbox = TRUE)

json <- batch_extract_bytes_sync(items = items, config = ExtractionConfig$default())
results <- jsonlite::fromJSON(json, simplifyVector = FALSE)

for (i in seq_along(results)) {
  cat(sprintf("[%d] mime=%s chars=%d\n",
              i, results[[i]]$mime_type, nchar(results[[i]]$content)))
}
C
#include "kreuzberg.h"
#include <stdio.h>
#include <stdlib.h>

/* Run a synchronous batch extraction over two in-memory plain-text items
 * described as JSON and print the JSON results. Returns 0 on success, 1 on failure. */
int main(void) {
    /* The batch is described as a JSON array of BatchBytesItem objects: each
     * entry carries "content" (byte values as integers), "mime_type", and may
     * carry a per-item "config". */
    const char *batch_json =
        "["
        "  {\"content\": [72,101,108,108,111,33], \"mime_type\": \"text/plain\"},"
        "  {\"content\": [87,111,114,108,100,33], \"mime_type\": \"text/plain\"}"
        "]";

    KREUZBERGExtractionConfig *cfg = kreuzberg_extraction_config_default();

    /* On success this is a JSON array of ExtractionResult objects; NULL means failure. */
    char *output_json = kreuzberg_batch_extract_bytes_sync(batch_json, cfg);
    if (output_json == NULL) {
        fprintf(stderr, "batch extraction failed (code %d): %s\n",
                kreuzberg_last_error_code(),
                kreuzberg_last_error_context());
        kreuzberg_extraction_config_free(cfg);
        return 1;
    }

    printf("%s\n", output_json);
    kreuzberg_free_string(output_json);
    kreuzberg_extraction_config_free(cfg);
    return 0;
}
WASM
// WASM has no batch helper; await extractBytes for each input (in parallel via Promise.all).
import init, { extractBytes } from "kreuzberg-wasm";

await init();

const urls = ["document1.pdf", "document2.pdf"];

const results = await Promise.all(
  urls.map(async (url) => {
    const resp = await fetch(url);
    const bytes = new Uint8Array(await resp.arrayBuffer());
    return extractBytes(bytes, "application/pdf", undefined);
  }),
);

results.forEach((result, i) => {
  console.log(`Document ${i + 1}: ${result.content.length} characters`);
});

Per-File Configuration (since v4.5.0)

When a batch contains a mix of document types that need different settings (for example, scanned images needing OCR alongside text-based PDFs), use FileExtractionConfig to override options per file while sharing a common batch config.

mixed_batch.py
from kreuzberg import (
    batch_extract_files_sync,
    ExtractionConfig,
    FileExtractionConfig,
    OcrConfig,
)

config = ExtractionConfig(output_format="markdown")

paths = ["report.pdf", "scan.tiff", "notes.html"]
file_configs = [
    None,
    FileExtractionConfig(
        force_ocr=True,
        ocr=OcrConfig(backend="tesseract", language="deu"),
    ),
    FileExtractionConfig(output_format="plain"),
]

results = batch_extract_files_sync(paths, config, file_configs=file_configs)
mixed_batch.ts
import { batchExtractFilesSync } from '@kreuzberg/node';

const results = batchExtractFilesSync(
  ['report.pdf', 'scan.tiff', 'notes.html'],
  { outputFormat: 'markdown' },
  [
    null,
    { forceOcr: true, ocr: { backend: 'tesseract', language: 'deu' } },
    { outputFormat: 'plain' },
  ],
);
mixed_batch.rs
use kreuzberg::{
    batch_extract_files, ExtractionConfig, FileExtractionConfig,
    OcrConfig, OutputFormat,
};
use std::path::PathBuf;

let config = ExtractionConfig {
    output_format: OutputFormat::Markdown,
    ..Default::default()
};

let paths = vec![
    PathBuf::from("report.pdf"),
    PathBuf::from("scan.tiff"),
    PathBuf::from("notes.html"),
];

let file_configs = vec![
    None,
    Some(FileExtractionConfig {
        force_ocr: Some(true),
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "deu".to_string(),
            ..Default::default()
        }),
        ..Default::default()
    }),
    Some(FileExtractionConfig {
        output_format: Some(OutputFormat::Plain),
        ..Default::default()
    }),
];

let results = batch_extract_files(paths, &config, Some(&file_configs)).await?;

Fields set to None in FileExtractionConfig inherit the batch default. Batch-level concerns like max_concurrent_extractions, use_cache, and security_limits cannot be overridden per file. See the Configuration Reference for the full list of overridable fields.

Content Filtering (since v4.8.0)

Kreuzberg strips running headers, footers, watermarks, and cross-page repeating text by default so that downstream RAG and LLM pipelines see clean body content. ContentFilterConfig lets you opt back in to any of these when you need them, for example when extracting legal forms where the header carries the case number, or when running text analysis on a PDF whose brand name was being incorrectly removed by the repeating-text heuristic.

By default headers, footers, and watermarks are stripped and cross-page repeating text is deduplicated; see ContentFilterConfig for field-level defaults and per-format behavior.

keep_headers_footers.py
from kreuzberg import (
    extract_file_sync,
    ContentFilterConfig,
    ExtractionConfig,
)

# Legal/forms work: header and footer text is meaningful, so keep both.
content_filter = ContentFilterConfig(include_headers=True, include_footers=True)
config = ExtractionConfig(content_filter=content_filter)

result = extract_file_sync("contract.pdf", config=config)
disable_repeating_text.ts
import { extract } from "@kreuzberg/node";

// Turn off cross-page deduplication so repeated brand names survive extraction.
const options = {
  contentFilter: { stripRepeatingText: false },
};

const result = await extract("brochure.pdf", options);
content_filter.rs
use kreuzberg::{extract_file_sync, ContentFilterConfig, ExtractionConfig};

// Every ContentFilterConfig field is spelled out so the intent is explicit.
let config = ExtractionConfig {
    content_filter: Some(ContentFilterConfig {
        // Opt back in to header and footer text.
        include_headers: true,
        include_footers: true,
        // Keep cross-page repeating-text removal switched on.
        strip_repeating_text: true,
        // Keep watermarks out of the extracted content.
        include_watermarks: false,
    }),
    ..Default::default()
};

// `?` returns early on failure, so this line belongs inside a function
// that returns a compatible `Result`.
let result = extract_file_sync("contract.pdf", None, &config)?;

When a layout-detection model is active, it can independently classify regions as page headers or footers and strip them per page. Setting include_headers=True / include_footers=True also disables that per-page stripping. See the reference page for the full field semantics and per-format behavior.

Supported Formats

Kreuzberg supports 90+ file formats across 8 categories:

Category Extensions Notes
PDF .pdf Native text + OCR for scanned pages
Images .png, .jpg, .jpeg, .tiff, .bmp, .webp Requires OCR backend
Office .docx, .pptx, .xlsx Modern formats via native parsers
Legacy Office .doc, .ppt Native OLE/CFB parsing
Email .eml, .msg Full support including attachments
Web .html, .htm Converted to Markdown with metadata
Text .md, .txt, .xml, .json, .yaml, .toml, .csv Direct extraction
Archives .zip, .tar, .tar.gz, .tar.bz2 Recursive extraction

Page Tracking

Kreuzberg can track page boundaries and extract per-page content. Page tracking availability depends on the format:

  • PDF — Full byte-accurate page tracking with O(1) lookup
  • PPTX — Slide boundary tracking (each slide = one page)
  • DOCX — Best-effort detection using explicit <w:br w:type="page"/> tags
  • Other formats — No page tracking

Enable page extraction with PageConfig:

page_tracking.py
from kreuzberg import ExtractionConfig, PageConfig

# Insert a marker at every page boundary; {page_num} becomes the page number,
# producing markers such as <!-- PAGE 1 --> in the extracted content.
config = ExtractionConfig(
    pages=PageConfig(
        insert_page_markers=True,
        marker_format="\n\n<!-- PAGE {page_num} -->\n\n",
    )
)

Page markers like <!-- PAGE 1 --> are inserted at boundaries in the content field — useful for LLMs that need to understand document layout. When both page tracking and chunking are enabled, chunks automatically include first_page and last_page metadata.

See PageConfig Reference for all options and Advanced Page Tracking for chunk-to-page mapping examples.

Code File Extraction

Source code files (.py, .rs, .ts, .go, etc.) go through tree-sitter and produce a ProcessResult on ExtractionResult.code_intelligence (structure, imports/exports, symbols, docstrings, diagnostics, semantic chunks). Code files bypass text chunking — the function/class-aware CodeChunks produced by the tree-sitter language pack (TSLP) map directly to Kreuzberg Chunks with semantic chunk_type and heading context.

See Code Intelligence for usage and TreeSitterProcessConfig for fields.

PDF Page Rendering

Render individual PDF pages as PNG images. Unlike the extraction pipeline (which parses text, tables, and metadata), this API produces rendered page images for thumbnails, vision model input, or custom OCR pipelines.

Two Approaches

API When to use
render_pdf_page You know which page you need, or only need a few pages
PdfPageIterator Process every page sequentially without loading all images into memory

DPI Configuration

DPI Pixel size (US Letter) Use case
72 612 x 792 Thumbnails, quick previews
150 (default) 1275 x 1650 General-purpose, screen display
300 2550 x 3300 OCR input, print quality

Tip: Use 300 DPI when rendering pages for OCR or vision models. The default 150 DPI may reduce recognition accuracy on small text.

MIME Type Detection

When extracting from bytes, Kreuzberg requires an explicit MIME type since there's no file extension to infer it from. For file paths, the MIME type is detected automatically from the extension; pass one explicitly only when the extension is missing or misleading.

Example: Override MIME Type

Python
from kreuzberg import ExtractionConfig, extract_file_sync

config = ExtractionConfig()

# File without extension — provide MIME type explicitly.
# The sync variant is used so the call returns a result directly;
# `extract_file` is the async variant and would have to be awaited.
result = extract_file_sync("document_copy", mime_type="application/pdf", config=config)

Error Handling

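All extraction functions report failures through typed errors: exceptions in languages that have them, and error values in the others (for example `Result` in Rust, `error` in Go, and a NULL result plus an error code in C). Handle the specific error types to distinguish different failure modes: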

Python
from kreuzberg import (
    ExtractionConfig,
    KreuzbergError,
    OCRError,
    ParsingError,
    ValidationError,
    extract_bytes_sync,
    extract_file_sync,
)

# File-based extraction: specific handlers come first, KreuzbergError last.
try:
    result = extract_file_sync("document.pdf")
    print(f"Extracted {len(result.content)} characters")
except FileNotFoundError as exc:
    print(f"File not found: {exc}")
except ParsingError as exc:
    print(f"Failed to parse document: {exc}")
except OCRError as exc:
    print(f"OCR processing failed: {exc}")
except KreuzbergError as exc:
    print(f"Extraction error: {exc}")

# Bytes-based extraction: the MIME type is passed explicitly.
try:
    config = ExtractionConfig()
    pdf_bytes = b"%PDF-1.4\n"
    result = extract_bytes_sync(pdf_bytes, "application/pdf", config)
    print(f"Extracted: {result.content[:100]}")
except ValidationError as exc:
    print(f"Invalid configuration: {exc}")
except OCRError as exc:
    print(f"OCR failed: {exc}")
except KreuzbergError as exc:
    print(f"Extraction failed: {exc}")
TypeScript
import { extractFileSync } from "@kreuzberg/node";

try {
  const result = extractFileSync("missing.pdf");
  console.log(result.content);
} catch (error: unknown) {
  // Log a readable message when the thrown value is an Error instance.
  if (error instanceof Error) {
    console.error(`Extraction failed: ${error.message}`);
  }
  // Always rethrow so the caller still sees the failure.
  throw error;
}
Rust
use kreuzberg::{extract_file_sync, ExtractionConfig, KreuzbergError};

fn main() {
    let config = ExtractionConfig::default();
    match extract_file_sync("document.pdf", None, &config) {
        Ok(result) => println!("{}", result.content),
        Err(KreuzbergError::Io(e)) => eprintln!("File error: {e}"),
        Err(KreuzbergError::UnsupportedFormat(mime)) => {
            eprintln!("Unsupported format: {mime}");
        }
        Err(KreuzbergError::Parsing { message, .. }) => {
            eprintln!("Corrupt or invalid document: {message}");
        }
        Err(KreuzbergError::MissingDependency(dep)) => {
            eprintln!("Missing dependency — install {dep}");
        }
        Err(e) => eprintln!("Extraction failed: {e}"),
    }
}
Go
package main

import (
    "errors"
    "log"

    "github.com/kreuzberg-dev/kreuzberg/v5"
)

// main extracts a file and logs a message chosen by the kind of error
// returned; on success it prints the extracted content.
func main() {
    result, err := kreuzberg.ExtractFileSync("missing.pdf", nil, kreuzberg.ExtractionConfig{})
    if err == nil {
        println("Content:", result.Content)
        return
    }

    // errors.Is matches the sentinel even when the error has been wrapped.
    switch {
    case errors.Is(err, kreuzberg.ErrIo):
        log.Printf("file not found: %v", err)
    case errors.Is(err, kreuzberg.ErrUnsupportedFormat):
        log.Printf("unsupported format: %v", err)
    default:
        log.Printf("extraction error: %v", err)
    }
}
Java
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.ExtractionConfig;
import dev.kreuzberg.KreuzbergRsException;
import java.nio.file.Paths;

try {
    // Build a configuration without setting any options.
    ExtractionConfig config = ExtractionConfig.builder().build();
    ExtractionResult result = Kreuzberg.extractFileSync(Paths.get("missing.pdf"), config);
    System.out.println(result.content());
} catch (KreuzbergRsException e) {
    // Report both the message and the error code carried by the exception.
    System.err.println("Extraction failed: " + e.getMessage());
    System.err.println("Error code: " + e.getCode());
}
C#
using Kreuzberg;

// Handlers are ordered from the most specific exception type to the general one.
try
{
    var result = KreuzbergLib.ExtractFileSync("missing.pdf");
    Console.WriteLine(result.Content);
}
catch (KreuzbergValidationException ex)
{
    // Validation errors are logged and not rethrown.
    Console.Error.WriteLine($"Validation error: {ex.Message}");
}
catch (KreuzbergIOException ex)
{
    // I/O errors are logged, then rethrown to the caller.
    Console.Error.WriteLine($"IO error: {ex.Message}");
    throw;
}
catch (KreuzbergException ex)
{
    // Any other Kreuzberg failure is logged, then rethrown to the caller.
    Console.Error.WriteLine($"Extraction failed: {ex.Message}");
    throw;
}
Ruby
require 'kreuzberg'

begin
  result = Kreuzberg.extract_file_sync('missing.pdf')
  puts result.content
rescue RuntimeError => e
  # All extraction errors are raised as RuntimeError
  # Check error message for specific error details
  case e.message
  when /validation/i
    puts "Validation error: #{e.message}"
  # Word boundaries keep "io" from matching inside unrelated words such as
  # "extraction"; "IO" and "I/O" are both accepted.
  when /\bi\/?o\b|not found/i
    puts "IO error: #{e.message}"
    raise
  else
    puts "Extraction failed: #{e.message}"
    raise
  end
end
R
library(kreuzberg)

# Raw bytes paired with a MIME type that is presumably unsupported,
# so the call below is expected to fail.
content <- charToRaw("Hello, world!")

# tryCatch yields the parsed result on success, or NULL after reporting the error.
result <- tryCatch(
  {
    # The return value is treated as JSON text and parsed into a nested list.
    json <- extract_bytes_sync(
      content = content,
      mime_type = "application/x-nonexistent",
      config = ExtractionConfig$default()
    )
    jsonlite::fromJSON(json, simplifyVector = FALSE)
  },
  error = function(e) {
    # Print the condition's message and signal failure with NULL.
    message(sprintf("Extraction failed: %s", conditionMessage(e)))
    NULL
  }
)

# NULL means the handler above ran; otherwise report the extracted length.
if (is.null(result)) {
  cat("No content extracted; falling back to original bytes\n")
} else {
  cat(sprintf("Extracted %d characters\n", nchar(result$content)))
}
C
#include "kreuzberg.h"
#include <stdio.h>
#include <stdlib.h>

/*
 * Demonstrates the C error-reporting pattern: a NULL result signals failure,
 * after which the error code and message are read through the
 * kreuzberg_last_error_* functions.
 */
int main(void) {
    KREUZBERGExtractionConfig *config = kreuzberg_extraction_config_default();

    /* Pass an unsupported MIME type to trigger an error. */
    KREUZBERGExtractionResult *result =
        kreuzberg_extract_bytes_sync(NULL, 0, "application/x-unknown", config);
    if (!result) {
        int32_t code = kreuzberg_last_error_code();
        const char *message = kreuzberg_last_error_context();
        /* message is valid until the next FFI call on this thread — copy if needed. */
        fprintf(stderr, "error %d: %s\n", code, message ? message : "(no message)");
        /* Release the config before the early exit. */
        kreuzberg_extraction_config_free(config);
        /* Exit with the library's error code, or 1 if that code is 0. */
        return code != 0 ? code : 1;
    }

    /* The returned string is released below with kreuzberg_free_string. */
    char *content = kreuzberg_extraction_result_content(result);
    printf("%s\n", content ? content : "(empty)");
    kreuzberg_free_string(content);

    /* Free the result and the config before exiting. */
    kreuzberg_extraction_result_free(result);
    kreuzberg_extraction_config_free(config);
    return 0;
}
WASM
import init, { extractBytes } from "kreuzberg-wasm";

await init();

const response = await fetch("document.pdf");
const data = new Uint8Array(await response.arrayBuffer());

try {
  const result = await extractBytes(data, "application/pdf", undefined);
  console.log(`Success: ${result.content.length} characters`);
} catch (error) {
  // A thrown value is not guaranteed to be an Error instance; stringify
  // anything else so no failure is silently dropped.
  const message = error instanceof Error ? error.message : String(error);
  console.error("Extraction error:", message);
}

System Errors

OSError (Python), KreuzbergError::Io (Rust), and other system-level errors always propagate to the caller. These indicate real system problems (permissions, disk space, etc.) that your application should handle.

Next Steps

Edit this page on GitHub