Quick Start¶
This guide walks you through Kreuzberg's core API — extracting text, handling errors, running OCR, and working with metadata. Install your binding first if you haven't: Installation.
Node.js or Browser?
Kreuzberg provides two TypeScript packages for different runtimes:
@kreuzberg/node — Use for Node.js servers and CLI tools (native performance, 100% speed)
@kreuzberg/wasm — Use for browsers, Cloudflare Workers, Deno, Bun, and serverless (60-80% speed, cross-platform)
The examples below show both. Pick the one matching your runtime. See Platform Overview for detailed guidance.
Your First Extraction¶
Pass a file path to get its text content. Kreuzberg detects the format automatically:
#include "kreuzberg.h"
#include <stdio.h>

/* Extract text from a file and print its content and detected MIME type.
 * Returns 0 on success, 1 on extraction failure. */
int main(void) {
    struct CExtractionResult *result = kreuzberg_extract_file_sync("document.pdf");
    if (!result || !result->success) {
        struct CErrorDetails err = kreuzberg_get_error_details();
        fprintf(stderr, "Error: %s\n", err.message);
        /* A failed call may still return an allocated result; release it
         * here as well, mirroring the error-handling example. */
        if (result) {
            kreuzberg_free_result(result);
        }
        return 1;
    }
    printf("%s\n", result->content);
    printf("MIME type: %s\n", result->mime_type);
    kreuzberg_free_result(result);
    return 0;
}
package main

import (
	"fmt"
	"log"

	"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

// Extract a PDF with the default configuration (nil) and print its
// content, table count, and metadata.
func main() {
	result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}
	fmt.Println(result.Content)
	fmt.Printf("Tables: %d\n", len(result.Tables))
	fmt.Printf("Metadata: %+v\n", result.Metadata)
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;

// Extract a PDF and print content, table count, and metadata.
try {
    ExtractionResult result = Kreuzberg.extractFile("document.pdf");
    System.out.println(result.getContent());
    System.out.println("Tables: " + result.getTables().size());
    System.out.println("Metadata: " + result.getMetadata());
} catch (IOException | KreuzbergException e) {
    // Both I/O problems and extraction failures surface here.
    e.printStackTrace();
}
from kreuzberg import extract_file_sync, ExtractionConfig

# Extract with the default configuration; the format is detected automatically.
config: ExtractionConfig = ExtractionConfig()
result = extract_file_sync("document.pdf", config=config)
content: str = result.content
table_count: int = len(result.tables)
metadata: dict = result.metadata
print(f"Content length: {len(content)} characters")
print(f"Tables: {table_count}")
print(f"Metadata keys: {list(metadata.keys())}")
library(kreuzberg)

# Extract a file synchronously
result <- extract_file_sync("path/to/document.pdf")

# Access extraction results; page_count() and quality_score summarize
# the extraction, mime_type is the detected format.
cat("Content length:", nchar(result$content), "\n")
cat("Mime type:", result$mime_type, "\n")
cat("Pages:", page_count(result), "\n")
cat("Quality score:", result$quality_score, "\n")
use kreuzberg::{extract_file_sync, ExtractionConfig};

/// Extract a PDF with default settings and print content, tables, and metadata.
fn main() -> kreuzberg::Result<()> {
    // `None` is the optional MIME-type hint; format detection is automatic.
    let result = extract_file_sync("document.pdf", None, &ExtractionConfig::default())?;
    println!("{}", result.content);
    println!("Tables: {}", result.tables.len());
    println!("Metadata: {:?}", result.metadata);
    Ok(())
}
import { extractFromFile, initWasm } from '@kreuzberg/wasm';

// The WASM module must be initialized once before any extraction call.
await initWasm();

const fileInput = document.getElementById('file') as HTMLInputElement;
const file = fileInput.files?.[0];
if (file) {
  const result = await extractFromFile(file);
  console.log(result.content);
  console.log(`Tables: ${result.tables.length}`);
  console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
}
Handle Errors¶
Wrap extractions in error handling before going further. Kreuzberg raises specific exceptions for missing files, parse failures, and OCR problems:
#include "kreuzberg.h"
#include <stdio.h>

/* Demonstrates inspecting the error code after a failed extraction. */
int main(void) {
    struct CExtractionResult *result = kreuzberg_extract_file_sync("missing.pdf");
    if (!result || !result->success) {
        struct CErrorDetails err = kreuzberg_get_error_details();
        fprintf(stderr, "Error [%s]: %s\n",
                kreuzberg_error_code_name(err.error_code),
                err.message);
        /* Compare against the well-known error-code accessors. */
        if (err.error_code == kreuzberg_error_code_io()) {
            fprintf(stderr, "File not found or unreadable\n");
        } else if (err.error_code == kreuzberg_error_code_unsupported_format()) {
            fprintf(stderr, "Unsupported file format\n");
        }
        /* A failed call may still return an allocated result; free it. */
        if (result) kreuzberg_free_result(result);
        return 1;
    }
    printf("%s\n", result->content);
    kreuzberg_free_result(result);
    return 0;
}
using Kreuzberg;

// Catch the most specific Kreuzberg exception types first; the base
// KreuzbergException is the catch-all for any other extraction failure.
try
{
    var result = KreuzbergClient.ExtractFileSync("missing.pdf");
    Console.WriteLine(result.Content);
}
catch (KreuzbergValidationException ex)
{
    Console.Error.WriteLine($"Validation error: {ex.Message}");
}
catch (KreuzbergIOException ex)
{
    Console.Error.WriteLine($"IO error: {ex.Message}");
    throw;
}
catch (KreuzbergException ex)
{
    Console.Error.WriteLine($"Extraction failed: {ex.Message}");
    throw;
}
package main

import (
	"errors"
	"fmt"
	"log"

	"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

// Classify extraction failures by matching err against the library's
// typed error values with errors.As.
func main() {
	result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
	if err != nil {
		switch {
		case errors.As(err, new(*kreuzberg.ValidationError)):
			log.Fatalf("invalid configuration: %v", err)
		case errors.As(err, new(*kreuzberg.ParsingError)):
			log.Fatalf("failed to parse document: %v", err)
		case errors.As(err, new(*kreuzberg.OCRError)):
			log.Fatalf("OCR processing failed: %v", err)
		case errors.As(err, new(*kreuzberg.MissingDependencyError)):
			log.Fatalf("missing dependency: %v", err)
		default:
			log.Fatalf("extraction error: %v", err)
		}
	}
	fmt.Println(result.Content)
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;

// File-based extraction: IOException covers missing/unreadable files.
try {
    ExtractionResult result = Kreuzberg.extractFile("document.pdf");
    System.out.println("Extracted: " + result.getContent()
        .substring(0, Math.min(100, result.getContent().length())));
} catch (IOException e) {
    System.err.println("File not found: " + e.getMessage());
} catch (KreuzbergException e) {
    System.err.println("Extraction failed: " + e.getMessage());
}

// Byte-based extraction: the MIME type must be supplied explicitly.
try {
    byte[] pdfBytes = new byte[] { };
    ExtractionResult result = Kreuzberg.extractBytes(pdfBytes, "application/pdf", null);
    System.out.println("Extracted " + result.getContent().length() + " characters");
} catch (KreuzbergException e) {
    System.err.println("Extraction failed: " + e.getMessage());
}
from kreuzberg import extract_file_sync, extract_bytes_sync, ExtractionConfig
from kreuzberg import (
    KreuzbergError,
    ParsingError,
    OCRError,
    ValidationError,
)

# File-based extraction: catch the most specific exceptions first;
# KreuzbergError is the base class for all library errors.
try:
    result = extract_file_sync("document.pdf")
    print(f"Extracted {len(result.content)} characters")
except FileNotFoundError as e:
    print(f"File not found: {e}")
except ParsingError as e:
    print(f"Failed to parse document: {e}")
except OCRError as e:
    print(f"OCR processing failed: {e}")
except KreuzbergError as e:
    print(f"Extraction error: {e}")

# Byte-based extraction: the MIME type must be passed explicitly.
try:
    config: ExtractionConfig = ExtractionConfig()
    pdf_bytes: bytes = b"%PDF-1.4\n"
    result = extract_bytes_sync(pdf_bytes, "application/pdf", config)
    print(f"Extracted: {result.content[:100]}")
except ValidationError as e:
    print(f"Invalid configuration: {e}")
except OCRError as e:
    print(f"OCR failed: {e}")
except KreuzbergError as e:
    print(f"Extraction failed: {e}")
require 'kreuzberg'

# Rescue the most specific Kreuzberg error classes first; Kreuzberg::Error
# is the library base class, StandardError catches anything non-library.
begin
  result = Kreuzberg.extract_file_sync('document.pdf')
  puts result.content
rescue Kreuzberg::ValidationError => e
  puts "Invalid configuration: #{e.message}"
rescue Kreuzberg::ParsingError => e
  puts "Failed to parse document: #{e.message}"
rescue Kreuzberg::OCRError => e
  puts "OCR processing failed: #{e.message}"
rescue Kreuzberg::MissingDependencyError => e
  puts "Missing dependency: #{e.message}"
rescue Kreuzberg::Error => e
  puts "Extraction error: #{e.message}"
rescue StandardError => e
  puts "System error: #{e.message}"
end
library(kreuzberg)

# Handle extraction errors with typed conditions
# (handlers are matched by condition class, most specific first).
result <- tryCatch({
  extract_file_sync("document.xyz")
},
UnsupportedFileType = function(e) {
  cat("Error: File type not supported\n")
  cat("Message:", conditionMessage(e), "\n")
  NULL
},
ValidationError = function(e) {
  cat("Error: Validation failed\n")
  cat("Message:", conditionMessage(e), "\n")
  NULL
},
kreuzberg_error = function(e) {
  cat("Error: Kreuzberg extraction failed\n")
  cat("Message:", conditionMessage(e), "\n")
  NULL
}
)
# Every handler returns NULL, so a non-NULL result means success.
if (!is.null(result)) {
  cat("Extraction successful\n")
}
use kreuzberg::{extract_file_sync, extract_bytes_sync, ExtractionConfig, KreuzbergError};

/// Demonstrates typed error handling for both file- and byte-based extraction.
fn main() -> kreuzberg::Result<()> {
    // File-based extraction: report each error category, then continue.
    match extract_file_sync("document.pdf", None, &ExtractionConfig::default()) {
        Ok(result) => {
            println!("Extracted {} characters", result.content.len());
        }
        Err(KreuzbergError::Parsing { message, .. }) => {
            eprintln!("Failed to parse document: {}", message);
        }
        Err(KreuzbergError::Ocr { message, .. }) => {
            eprintln!("OCR processing failed: {}", message);
        }
        Err(KreuzbergError::MissingDependency { message, .. }) => {
            eprintln!("Missing dependency: {}", message);
        }
        Err(e) => {
            eprintln!("Extraction failed: {}", e);
        }
    }

    let pdf_bytes = b"%PDF-1.4\n...";
    // Byte-based extraction: print a category-specific message, then propagate
    // the ORIGINAL error intact. (Rebuilding the error with `message.clone()`
    // and `source: None`, as before, both cloned needlessly and discarded the
    // underlying error-source chain.)
    match extract_bytes_sync(pdf_bytes, "application/pdf", None, &ExtractionConfig::default()) {
        Ok(result) => {
            // Char-based preview: slicing by byte index (`[..100]`) can panic
            // on a UTF-8 boundary for multi-byte content.
            let preview: String = result.content.chars().take(100).collect();
            println!("Extracted: {}", preview);
            Ok(())
        }
        Err(e) => {
            match &e {
                KreuzbergError::Validation { message, .. } => {
                    eprintln!("Invalid configuration: {}", message);
                }
                KreuzbergError::Ocr { message, .. } => {
                    eprintln!("OCR failed: {}", message);
                }
                _ => eprintln!("Extraction failed: {}", e),
            }
            Err(e)
        }
    }
}
# Example: Handling extraction errors
case Kreuzberg.extract_file("document.pdf") do
  {:ok, result} ->
    IO.puts("Successfully extracted content")
    IO.puts("Content length: #{byte_size(result.content)} characters")

  {:error, reason} ->
    IO.puts("Extraction failed: #{reason}")
end

# Example: Handling with custom error message
result = Kreuzberg.extract_file("nonexistent.pdf")

case result do
  {:ok, data} ->
    IO.puts("File processed successfully")

  {:error, error} ->
    # inspect/1 renders non-binary error terms safely
    IO.puts("Error details: #{inspect(error)}")
end

# Example: Extract with pattern matching
# (the is_binary guard separates plain string messages from structured errors)
case Kreuzberg.extract(<<>>, "application/pdf") do
  {:ok, result} ->
    IO.puts("Content: #{result.content}")

  {:error, msg} when is_binary(msg) ->
    IO.puts("Validation error: #{msg}")

  {:error, reason} ->
    IO.puts("Unknown error: #{inspect(reason)}")
end
import { extractFromFile, initWasm } from '@kreuzberg/wasm';

await initWasm();

const fileInput = document.getElementById('file') as HTMLInputElement;
const file = fileInput.files?.[0];
if (file) {
  try {
    const result = await extractFromFile(file);
    console.log(result.content);
  } catch (error) {
    // `catch` bindings are untyped; narrow before reading `.message`.
    if (error instanceof Error) {
      console.error(`Extraction error: ${error.message}`);
    } else {
      throw error;
    }
  }
}
OCR for Scanned Documents¶
Kreuzberg runs OCR automatically when it detects an image or scanned PDF. You can also force OCR on any document:
#include "kreuzberg.h"
#include <stdio.h>

/* Build an OCR configuration via the builder API, then extract a scanned image. */
int main(void) {
    struct ConfigBuilder *builder = kreuzberg_config_builder_new();
    /* OCR options are supplied as a JSON fragment. */
    kreuzberg_config_builder_set_ocr(builder,
        "{\"tesseract\":{\"language\":\"eng\"}}");
    ExtractionConfig *config = kreuzberg_config_builder_build(builder);
    char *config_json = kreuzberg_config_to_json(config);
    struct CExtractionResult *result =
        kreuzberg_extract_file_sync_with_config("scanned.png", config_json);
    if (result && result->success) {
        printf("OCR text: %s\n", result->content);
    } else {
        fprintf(stderr, "OCR error: %s\n", kreuzberg_get_error_details().message);
    }
    /* Release the result, the serialized config string, and the config. */
    kreuzberg_free_result(result);
    kreuzberg_free_string(config_json);
    kreuzberg_config_free(config);
    return 0;
}
package main

import (
	"log"

	"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

// Run Tesseract OCR with English language data on a scanned PDF.
func main() {
	lang := "eng" // Language is a *string, so bind it to a local first
	cfg := &kreuzberg.ExtractionConfig{
		OCR: &kreuzberg.OCRConfig{
			Backend:  "tesseract",
			Language: &lang,
		},
	}
	result, err := kreuzberg.ExtractFileSync("scanned.pdf", cfg)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}
	log.Println(len(result.Content))
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import dev.kreuzberg.config.ExtractionConfig;
import dev.kreuzberg.config.OcrConfig;
import java.io.IOException;

public class Main {
    public static void main(String[] args) {
        try {
            // Configure the Tesseract backend with English language data.
            ExtractionConfig config = ExtractionConfig.builder()
                .ocr(OcrConfig.builder()
                    .backend("tesseract")
                    .language("eng")
                    .build())
                .build();
            ExtractionResult result = Kreuzberg.extractFile("scanned.pdf", config);
            System.out.println(result.getContent());
        } catch (IOException | KreuzbergException e) {
            System.err.println("Extraction failed: " + e.getMessage());
        }
    }
}
from kreuzberg import extract_file_sync, ExtractionConfig, OcrConfig

# Configure the Tesseract backend with English language data.
config: ExtractionConfig = ExtractionConfig(
    ocr=OcrConfig(backend="tesseract", language="eng")
)
result = extract_file_sync("scanned.pdf", config=config)
content: str = result.content
preview: str = content[:100]
total_length: int = len(content)
print(f"Extracted content (preview): {preview}")
print(f"Total characters: {total_length}")
library(kreuzberg)

# Configure Tesseract OCR
ocr <- ocr_config(backend = "tesseract", language = "eng", dpi = 300L)
# force_ocr = TRUE runs OCR even when the document has a text layer
config <- extraction_config(force_ocr = TRUE, ocr = ocr)

# Extract text from a scanned image
result <- extract_file_sync("scan.png", config = config)
cat(sprintf("Extracted %d characters\n", nchar(result$content)))
cat(sprintf("Quality score: %s\n", result$quality_score))
cat("Content preview:\n")
cat(substr(result$content, 1, 200))
use kreuzberg::{extract_file_sync, ExtractionConfig, OcrConfig};

/// Run OCR with the Tesseract backend on a scanned PDF.
fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        ocr: Some(OcrConfig {
            backend: "tesseract".to_string(),
            language: "eng".to_string(),
            // Remaining OCR options keep their defaults.
            ..Default::default()
        }),
        ..Default::default()
    };
    let result = extract_file_sync("scanned.pdf", None, &config)?;
    println!("{}", result.content);
    Ok(())
}
alias Kreuzberg.ExtractionConfig

# OCR options are passed as a plain map on the config struct.
config = %ExtractionConfig{
  ocr: %{"enabled" => true, "backend" => "tesseract"}
}

{:ok, result} = Kreuzberg.extract_file("scanned_document.pdf", nil, config)
content = result.content

IO.puts("OCR Extracted content:")
IO.puts(content)
IO.puts("Metadata: #{inspect(result.metadata)}")
import { enableOcr, extractFromFile, initWasm } from '@kreuzberg/wasm';

await initWasm();
// OCR support loads separately; enable it before requesting OCR options.
await enableOcr();

const fileInput = document.getElementById('file') as HTMLInputElement;
const file = fileInput.files?.[0];
if (file) {
  const result = await extractFromFile(file, file.type, {
    ocr: {
      backend: 'kreuzberg-tesseract',
      language: 'eng',
    },
  });
  console.log(result.content);
}
import { enableOcr, extractFile, initWasm } from '@kreuzberg/wasm';

await initWasm();
await enableOcr(); // Uses native kreuzberg-tesseract backend

// Path-based extraction (non-browser runtimes); pass the MIME type explicitly.
const result = await extractFile('./scanned_document.png', 'image/png', {
  ocr: {
    backend: 'kreuzberg-tesseract',
    language: 'eng',
  },
});
console.log(result.content);
Process Multiple Files¶
Pass a list of paths to extract them in parallel:
#include "kreuzberg.h"
#include <stdio.h>

/* Extract several files in one batch call and print each successful result. */
int main(void) {
    const char *files[] = {"doc1.pdf", "doc2.docx", "doc3.txt"};
    uintptr_t count = 3;
    /* NULL config = library defaults. */
    struct CBatchResult *batch = kreuzberg_batch_extract_files_sync(files, count, NULL);
    if (!batch) {
        fprintf(stderr, "Batch error: %s\n", kreuzberg_get_error_details().message);
        return 1;
    }
    /* Results are positionally aligned with the input file list. */
    for (uintptr_t i = 0; i < batch->count; i++) {
        struct CExtractionResult *r = batch->results[i];
        if (r && r->success) {
            printf("--- %s ---\n%s\n", files[i], r->content);
        }
    }
    kreuzberg_free_batch_result(batch);
    return 0;
}
package main

import (
	"fmt"
	"log"

	"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

// Batch-extract several files; results align positionally with the input slice.
func main() {
	files := []string{"doc1.pdf", "doc2.docx", "doc3.pptx"}
	results, err := kreuzberg.BatchExtractFilesSync(files, nil)
	if err != nil {
		log.Fatalf("batch extract failed: %v", err)
	}
	for i, result := range results {
		// A nil entry means that file produced no result; skip it.
		if result == nil {
			continue
		}
		fmt.Printf("File %d: %d characters\n", i+1, len(result.Content))
	}
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;

// Batch-extract several files; results align positionally with the input list.
try {
    List<String> files = Arrays.asList("doc1.pdf", "doc2.docx", "doc3.pptx");
    List<ExtractionResult> results = Kreuzberg.batchExtractFiles(files, null);
    for (int i = 0; i < results.size(); i++) {
        ExtractionResult result = results.get(i);
        System.out.println("File " + (i + 1) + ": " + result.getContent().length() + " characters");
    }
} catch (IOException | KreuzbergException e) {
    e.printStackTrace();
}
from kreuzberg import batch_extract_files_sync, ExtractionConfig

# Batch extraction processes the files in parallel; results keep input order.
files: list[str] = ["doc1.pdf", "doc2.docx", "doc3.pptx"]
config: ExtractionConfig = ExtractionConfig()
results = batch_extract_files_sync(files, config=config)
for i, result in enumerate(results):
    char_count: int = len(result.content)
    print(f"File {i + 1}: {char_count} characters")
library(kreuzberg)

# Define file paths to extract
file_paths <- c(
  "documents/report.pdf",
  "documents/summary.docx",
  "documents/data.xlsx"
)

# Batch extract files (results keep the order of file_paths)
results <- batch_extract_files_sync(file_paths)

# Process results
for (i in seq_along(results)) {
  result <- results[[i]]
  cat(sprintf("File %d: %s\n", i, file_paths[i]))
  cat(sprintf(" Pages: %d\n", page_count(result)))
  cat(sprintf(" Elements: %d\n", length(result$elements)))
}
use kreuzberg::{batch_extract_file_sync, ExtractionConfig};

/// Batch-extract several files; results keep the order of the input list.
fn main() -> kreuzberg::Result<()> {
    let files = vec!["doc1.pdf", "doc2.docx", "doc3.pptx"];
    let config = ExtractionConfig::default();
    let results = batch_extract_file_sync(files, &config)?;
    for (i, result) in results.iter().enumerate() {
        println!("File {}: {} characters", i + 1, result.content.len());
    }
    Ok(())
}
# Batch-extract several files; results keep the order of the input list.
file_paths = ["document1.pdf", "document2.pdf", "document3.pdf"]

{:ok, results} = Kreuzberg.batch_extract_files(file_paths)

Enum.each(results, fn result ->
  IO.puts("File: #{result.mime_type}")
  IO.puts("Content length: #{byte_size(result.content)} characters")
  IO.puts("Tables: #{length(result.tables)}")
  IO.puts("---")
end)

IO.puts("Total files processed: #{length(results)}")
import { extractFromFile, initWasm } from '@kreuzberg/wasm';

await initWasm();

// Extract all selected files concurrently; Promise.all preserves input order.
const fileInputs = document.getElementById('files') as HTMLInputElement;
const files = Array.from(fileInputs.files || []);
const results = await Promise.all(
  files.map((file) => extractFromFile(file))
);
results.forEach((result, i) => {
  console.log(`File ${i + 1}: ${result.content.length} characters`);
});
Read Document Metadata¶
Every extraction result includes format-specific metadata — page count for PDFs, sheet names for Excel, dimensions for images:
#include "kreuzberg.h"
#include <stdio.h>

/* Print extraction content plus the optional metadata fields exposed on the
 * C result struct (language, date, subject, raw metadata JSON).
 * Returns 0 on success, 1 on extraction failure. */
int main(void) {
    struct CExtractionResult *result = kreuzberg_extract_file_sync("document.pdf");
    if (!result || !result->success) {
        fprintf(stderr, "Error: %s\n", kreuzberg_get_error_details().message);
        /* Free the result even when extraction reported failure — a non-NULL
         * failed result was previously leaked on this path. */
        if (result) {
            kreuzberg_free_result(result);
        }
        return 1;
    }
    printf("Content: %s\n", result->content);
    printf("MIME: %s\n", result->mime_type);
    /* Optional fields are NULL when the document does not provide them. */
    if (result->language)
        printf("Language: %s\n", result->language);
    if (result->date)
        printf("Date: %s\n", result->date);
    if (result->subject)
        printf("Subject: %s\n", result->subject);
    if (result->metadata_json)
        printf("Metadata: %s\n", result->metadata_json);
    kreuzberg_free_result(result);
    return 0;
}
using Kreuzberg;

// Read typed PDF metadata (Metadata.Format.Pdf) and typed HTML metadata
// (Metadata.Format.Html) from extraction results.
// NOTE(review): string.Join with .Select below requires `using System.Linq;`
// in a full program — confirm the snippet's assumed usings.
var config = new ExtractionConfig
{
    PdfOptions = new PdfConfig { ExtractMetadata = true }
};
var result = KreuzbergClient.ExtractFileSync("document.pdf", config);
if (result.Metadata?.Format.Pdf != null)
{
    var pdfMeta = result.Metadata.Format.Pdf;
    Console.WriteLine($"Pages: {pdfMeta.PageCount}");
    Console.WriteLine($"Author: {pdfMeta.Author}");
    Console.WriteLine($"Title: {pdfMeta.Title}");
}
var htmlResult = KreuzbergClient.ExtractFileSync("page.html", config);
if (htmlResult.Metadata?.Format.Html != null)
{
    var htmlMeta = htmlResult.Metadata.Format.Html;
    Console.WriteLine($"Title: {htmlMeta.Title}");
    Console.WriteLine($"Description: {htmlMeta.Description}");
    // Access keywords as array
    if (htmlMeta.Keywords != null && htmlMeta.Keywords.Count > 0)
    {
        Console.WriteLine($"Keywords: {string.Join(", ", htmlMeta.Keywords)}");
    }
    // Access canonical URL (renamed from canonical)
    if (htmlMeta.CanonicalUrl != null)
    {
        Console.WriteLine($"Canonical URL: {htmlMeta.CanonicalUrl}");
    }
    // Access Open Graph fields from dictionary
    if (htmlMeta.OpenGraph != null && htmlMeta.OpenGraph.Count > 0)
    {
        if (htmlMeta.OpenGraph.ContainsKey("image"))
            Console.WriteLine($"Open Graph Image: {htmlMeta.OpenGraph["image"]}");
        if (htmlMeta.OpenGraph.ContainsKey("title"))
            Console.WriteLine($"Open Graph Title: {htmlMeta.OpenGraph["title"]}");
        if (htmlMeta.OpenGraph.ContainsKey("type"))
            Console.WriteLine($"Open Graph Type: {htmlMeta.OpenGraph["type"]}");
    }
    // Access Twitter Card fields from dictionary
    if (htmlMeta.TwitterCard != null && htmlMeta.TwitterCard.Count > 0)
    {
        if (htmlMeta.TwitterCard.ContainsKey("card"))
            Console.WriteLine($"Twitter Card Type: {htmlMeta.TwitterCard["card"]}");
        if (htmlMeta.TwitterCard.ContainsKey("creator"))
            Console.WriteLine($"Twitter Creator: {htmlMeta.TwitterCard["creator"]}");
    }
    // Access new fields
    if (htmlMeta.Language != null)
        Console.WriteLine($"Language: {htmlMeta.Language}");
    if (htmlMeta.TextDirection != null)
        Console.WriteLine($"Text Direction: {htmlMeta.TextDirection}");
    // Access headers
    if (htmlMeta.Headers != null && htmlMeta.Headers.Count > 0)
        Console.WriteLine($"Headers: {string.Join(", ", htmlMeta.Headers.Select(h => h.Text))}");
    // Access links
    if (htmlMeta.Links != null && htmlMeta.Links.Count > 0)
    {
        foreach (var link in htmlMeta.Links)
            Console.WriteLine($"Link: {link.Href} ({link.Text})");
    }
    // Access images
    if (htmlMeta.Images != null && htmlMeta.Images.Count > 0)
        Console.WriteLine($"Images: {string.Join(", ", htmlMeta.Images.Select(i => i.Src))}");
    // Access structured data
    if (htmlMeta.StructuredData != null && htmlMeta.StructuredData.Count > 0)
        Console.WriteLine($"Structured Data items: {htmlMeta.StructuredData.Count}");
}
package main

import (
	"fmt"
	"log"
	"strings"

	"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

// Read typed PDF metadata and typed HTML metadata from extraction results.
// The accessors PdfMetadata()/HTMLMetadata() follow the (value, ok) idiom;
// scalar fields are pointers and are nil when absent.
func main() {
	result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
	if err != nil {
		log.Fatalf("extract pdf: %v", err)
	}
	// Access PDF metadata
	if pdf, ok := result.Metadata.PdfMetadata(); ok {
		if pdf.PageCount != nil {
			fmt.Printf("Pages: %d\n", *pdf.PageCount)
		}
		if pdf.Author != nil {
			fmt.Printf("Author: %s\n", *pdf.Author)
		}
		if pdf.Title != nil {
			fmt.Printf("Title: %s\n", *pdf.Title)
		}
	}
	// Access HTML metadata
	htmlResult, err := kreuzberg.ExtractFileSync("page.html", nil)
	if err != nil {
		log.Fatalf("extract html: %v", err)
	}
	if html, ok := htmlResult.Metadata.HTMLMetadata(); ok {
		if html.Title != nil {
			fmt.Printf("Title: %s\n", *html.Title)
		}
		if html.Description != nil {
			fmt.Printf("Description: %s\n", *html.Description)
		}
		// Access keywords as array
		if len(html.Keywords) > 0 {
			fmt.Printf("Keywords: %s\n", strings.Join(html.Keywords, ", "))
		}
		// Access canonical URL (renamed from canonical)
		if html.CanonicalURL != nil {
			fmt.Printf("Canonical URL: %s\n", *html.CanonicalURL)
		}
		// Access Open Graph fields from map
		if len(html.OpenGraph) > 0 {
			if image, ok := html.OpenGraph["image"]; ok {
				fmt.Printf("Open Graph Image: %s\n", image)
			}
			if ogTitle, ok := html.OpenGraph["title"]; ok {
				fmt.Printf("Open Graph Title: %s\n", ogTitle)
			}
			if ogType, ok := html.OpenGraph["type"]; ok {
				fmt.Printf("Open Graph Type: %s\n", ogType)
			}
		}
		// Access Twitter Card fields from map
		if len(html.TwitterCard) > 0 {
			if card, ok := html.TwitterCard["card"]; ok {
				fmt.Printf("Twitter Card Type: %s\n", card)
			}
			if creator, ok := html.TwitterCard["creator"]; ok {
				fmt.Printf("Twitter Creator: %s\n", creator)
			}
		}
		// Access new fields
		if html.Language != nil {
			fmt.Printf("Language: %s\n", *html.Language)
		}
		if html.TextDirection != nil {
			fmt.Printf("Text Direction: %s\n", *html.TextDirection)
		}
		// Access headers
		if len(html.Headers) > 0 {
			headers := make([]string, len(html.Headers))
			for i, h := range html.Headers {
				headers[i] = h.Text
			}
			fmt.Printf("Headers: %s\n", strings.Join(headers, ", "))
		}
		// Access links
		if len(html.Links) > 0 {
			for _, link := range html.Links {
				fmt.Printf("Link: %s (%s)\n", link.Href, link.Text)
			}
		}
		// Access images
		if len(html.Images) > 0 {
			for _, image := range html.Images {
				fmt.Printf("Image: %s\n", image.Src)
			}
		}
		// Access structured data
		if len(html.StructuredData) > 0 {
			fmt.Printf("Structured data items: %d\n", len(html.StructuredData))
		}
	}
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.Metadata;
import dev.kreuzberg.KreuzbergException;
import java.io.IOException;
import java.util.Map;
import java.util.List;

// Read flat metadata from extraction results: common fields via typed
// Optional getters, format-specific fields via the `additional` map.
public class Main {
    public static void main(String[] args) {
        try {
            // NOTE(review): other examples on this page call Kreuzberg.extractFile(...);
            // confirm that the sync variant name extractFileSync exists in the Java API.
            ExtractionResult result = Kreuzberg.extractFileSync("document.pdf");
            // Metadata is flat — format-specific fields are at the top level
            Metadata metadata = result.getMetadata();
            metadata.getTitle().ifPresent(t -> System.out.println("Title: " + t));
            metadata.getAuthors().ifPresent(a -> System.out.println("Authors: " + String.join(", ", a)));
            // Format-specific fields are in the additional map
            Map<String, Object> extra = metadata.getAdditional();
            if (extra.get("page_count") != null) {
                System.out.println("Pages: " + extra.get("page_count"));
            }
            // Access HTML metadata
            ExtractionResult htmlResult = Kreuzberg.extractFileSync("page.html");
            Metadata htmlMeta = htmlResult.getMetadata();
            htmlMeta.getTitle().ifPresent(t -> System.out.println("Title: " + t));
            Map<String, Object> htmlExtra = htmlMeta.getAdditional();
            String description = (String) htmlExtra.get("description");
            if (description != null) {
                System.out.println("Description: " + description);
            }
            // Access keywords as array
            htmlMeta.getKeywords().ifPresent(keywords ->
                System.out.println("Keywords: " + keywords));
            // Access canonical URL (renamed from canonical)
            String canonicalUrl = (String) htmlExtra.get("canonical_url");
            if (canonicalUrl != null) {
                System.out.println("Canonical URL: " + canonicalUrl);
            }
            // Access Open Graph fields from map
            @SuppressWarnings("unchecked")
            Map<String, String> openGraph = (Map<String, String>) htmlExtra.get("open_graph");
            if (openGraph != null) {
                System.out.println("Open Graph Image: " + openGraph.get("image"));
                System.out.println("Open Graph Title: " + openGraph.get("title"));
                System.out.println("Open Graph Type: " + openGraph.get("type"));
            }
            // Access Twitter Card fields from map
            @SuppressWarnings("unchecked")
            Map<String, String> twitterCard = (Map<String, String>) htmlExtra.get("twitter_card");
            if (twitterCard != null) {
                System.out.println("Twitter Card Type: " + twitterCard.get("card"));
                System.out.println("Twitter Creator: " + twitterCard.get("creator"));
            }
            // Access new fields
            htmlMeta.getLanguage().ifPresent(l -> System.out.println("Language: " + l));
            String textDirection = (String) htmlExtra.get("text_direction");
            if (textDirection != null) {
                System.out.println("Text Direction: " + textDirection);
            }
            // Access headers
            @SuppressWarnings("unchecked")
            List<Map<String, Object>> headers = (List<Map<String, Object>>) htmlExtra.get("headers");
            if (headers != null) {
                headers.stream()
                    .map(h -> h.get("text"))
                    .forEach(text -> System.out.print(text + ", "));
                System.out.println();
            }
            // Access links
            @SuppressWarnings("unchecked")
            List<Map<String, Object>> links = (List<Map<String, Object>>) htmlExtra.get("links");
            if (links != null) {
                for (Map<String, Object> link : links) {
                    System.out.println("Link: " + link.get("href") + " (" + link.get("text") + ")");
                }
            }
            // Access images
            @SuppressWarnings("unchecked")
            List<Map<String, Object>> images = (List<Map<String, Object>>) htmlExtra.get("images");
            if (images != null) {
                for (Map<String, Object> image : images) {
                    System.out.println("Image: " + image.get("src"));
                }
            }
            // Access structured data
            @SuppressWarnings("unchecked")
            List<Map<String, Object>> structuredData = (List<Map<String, Object>>) htmlExtra.get("structured_data");
            if (structuredData != null) {
                System.out.println("Structured data items: " + structuredData.size());
            }
        } catch (IOException | KreuzbergException e) {
            System.err.println("Extraction failed: " + e.getMessage());
        }
    }
}
from kreuzberg import extract_file_sync, ExtractionConfig

# PDF metadata: the metadata dict is flat, so format-specific keys such as
# page_count appear alongside common keys like title and authors.
result = extract_file_sync("document.pdf", config=ExtractionConfig())
# Metadata is flat — format-specific fields are at the top level
metadata = result.metadata
if metadata.get("page_count"):
    print(f"Pages: {metadata['page_count']}")
if metadata.get("title"):
    print(f"Title: {metadata['title']}")
if metadata.get("authors"):
    print(f"Authors: {', '.join(metadata['authors'])}")

# HTML metadata: same flat dict, with HTML-specific keys.
result = extract_file_sync("page.html", config=ExtractionConfig())
metadata = result.metadata
if metadata.get("title"):
    print(f"Title: {metadata['title']}")
if metadata.get("description"):
    print(f"Description: {metadata['description']}")
# Access keywords as array
keywords = metadata.get('keywords', [])
if keywords:
    print(f"Keywords: {', '.join(keywords)}")
# Access canonical URL (renamed from canonical)
canonical_url = metadata.get('canonical_url')
if canonical_url:
    print(f"Canonical URL: {canonical_url}")
# Access Open Graph fields from map
open_graph = metadata.get('open_graph', {})
if open_graph:
    if 'image' in open_graph:
        print(f"Open Graph Image: {open_graph['image']}")
    if 'title' in open_graph:
        print(f"Open Graph Title: {open_graph['title']}")
    if 'type' in open_graph:
        print(f"Open Graph Type: {open_graph['type']}")
# Access Twitter Card fields from map
twitter_card = metadata.get('twitter_card', {})
if twitter_card:
    if 'card' in twitter_card:
        print(f"Twitter Card Type: {twitter_card['card']}")
    if 'creator' in twitter_card:
        print(f"Twitter Creator: {twitter_card['creator']}")
# Access new fields
language = metadata.get('language')
if language:
    print(f"Language: {language}")
text_direction = metadata.get('text_direction')
if text_direction:
    print(f"Text Direction: {text_direction}")
# Access headers
headers = metadata.get('headers', [])
if headers:
    print(f"Headers: {', '.join([h['text'] for h in headers])}")
# Access links
links = metadata.get('links', [])
if links:
    for link in links:
        print(f"Link: {link.get('href')} ({link.get('text')})")
# Access images
images = metadata.get('images', [])
if images:
    for image in images:
        print(f"Image: {image.get('src')}")
# Access structured data
structured_data = metadata.get('structured_data', [])
if structured_data:
    print(f"Structured data items: {len(structured_data)}")
require 'kreuzberg'

# PDF metadata: the metadata hash is flat, so format-specific keys such as
# page_count appear alongside common keys like title and authors.
result = Kreuzberg.extract_file_sync('document.pdf')
# Metadata is flat — format-specific fields are at the top level
metadata = result.metadata
if metadata['page_count']
  puts "Pages: #{metadata['page_count']}"
end
if metadata['title']
  puts "Title: #{metadata['title']}"
end
if metadata['authors']
  puts "Authors: #{metadata['authors'].join(', ')}"
end

# Access HTML metadata
html_result = Kreuzberg.extract_file_sync('page.html')
metadata = html_result.metadata
if metadata['title']
  puts "Title: #{metadata['title']}"
end
if metadata['description']
  puts "Description: #{metadata['description']}"
end
# Access keywords as array
if metadata['keywords']
  puts "Keywords: #{metadata['keywords'].join(', ')}"
end
# Access canonical URL (renamed from canonical)
puts "Canonical URL: #{metadata['canonical_url']}" if metadata['canonical_url']
# Access Open Graph fields from map
open_graph = metadata['open_graph'] || {}
puts "Open Graph Image: #{open_graph['image']}" if open_graph['image']
puts "Open Graph Title: #{open_graph['title']}" if open_graph['title']
puts "Open Graph Type: #{open_graph['type']}" if open_graph['type']
# Access Twitter Card fields from map
twitter_card = metadata['twitter_card'] || {}
puts "Twitter Card Type: #{twitter_card['card']}" if twitter_card['card']
puts "Twitter Creator: #{twitter_card['creator']}" if twitter_card['creator']
# Access new fields
puts "Language: #{metadata['language']}" if metadata['language']
puts "Text Direction: #{metadata['text_direction']}" if metadata['text_direction']
# Access headers
if metadata['headers']
  puts "Headers: #{metadata['headers'].map { |h| h['text'] }.join(', ')}"
end
# Access links
if metadata['links']
  metadata['links'].each do |link|
    puts "Link: #{link['href']} (#{link['text']})"
  end
end
# Access images
if metadata['images']
  metadata['images'].each do |image|
    puts "Image: #{image['src']}"
  end
end
# Access structured data
if metadata['structured_data']
  puts "Structured data items: #{metadata['structured_data'].length}"
end
library(kreuzberg)

# Top-level result fields summarize the extraction itself.
result <- extract_file_sync("document.pdf")
cat("Detected Language:", result$detected_language, "\n")
cat("Quality Score:", result$quality_score, "\n")
cat("Keywords:", paste(result$keywords, collapse=", "), "\n\n")

# Individual metadata fields are read via metadata_field(); it returns
# NULL when the document does not provide the field.
cat("Metadata fields:\n")
authors <- metadata_field(result, "authors")
if (!is.null(authors)) {
  cat("Authors:", paste(authors, collapse=", "), "\n")
}
created <- metadata_field(result, "created_date")
if (!is.null(created)) {
  cat("Created Date:", created, "\n")
}
pages_meta <- metadata_field(result, "page_count")
if (!is.null(pages_meta)) {
  cat("Pages:", pages_meta, "\n")
}
use kreuzberg::{extract_file_sync, ExtractionConfig};

fn main() -> kreuzberg::Result<()> {
    // One default config reused for both extractions.
    let config = ExtractionConfig::default();

    // PDF documents expose their metadata under `metadata.pdf`.
    let result = extract_file_sync("document.pdf", None, &config)?;
    if let Some(pdf) = result.metadata.pdf {
        if let Some(pages) = pdf.page_count {
            println!("Pages: {}", pages);
        }
        if let Some(author) = pdf.author {
            println!("Author: {}", author);
        }
        if let Some(title) = pdf.title {
            println!("Title: {}", title);
        }
    }

    // HTML documents expose theirs under `metadata.html`.
    let html_result = extract_file_sync("page.html", None, &config)?;
    if let Some(html) = html_result.metadata.html {
        if let Some(title) = html.title {
            println!("Title: {}", title);
        }
        if let Some(desc) = html.description {
            println!("Description: {}", desc);
        }

        // Keywords are a plain vector; Debug-print it.
        println!("Keywords: {:?}", html.keywords);

        // Canonical URL (renamed from `canonical`).
        if let Some(canonical) = html.canonical_url {
            println!("Canonical URL: {}", canonical);
        }

        // Open Graph values live in a string-keyed map.
        if let Some(og_image) = html.open_graph.get("image") {
            println!("Open Graph Image: {}", og_image);
        }
        if let Some(og_title) = html.open_graph.get("title") {
            println!("Open Graph Title: {}", og_title);
        }

        // Twitter Card values likewise.
        if let Some(card) = html.twitter_card.get("card") {
            println!("Twitter Card Type: {}", card);
        }

        // Document-level language information.
        if let Some(lang) = html.language {
            println!("Language: {}", lang);
        }

        // Iterating an empty Vec prints nothing, so no emptiness guards are needed.
        for header in &html.headers {
            println!("Header (level {}): {}", header.level, header.text);
        }
        for link in &html.links {
            println!("Link: {} ({})", link.href, link.text);
        }
        for image in &html.images {
            println!("Image: {}", image.src);
        }

        // Structured data (e.g. JSON-LD) items, reported only when present.
        if !html.structured_data.is_empty() {
            println!("Structured data items: {}", html.structured_data.len());
        }
    }

    Ok(())
}
{:ok, result} = Kreuzberg.extract_file("document.pdf")

# Metadata is a single flat map — format-specific fields sit at the top level.
meta = result.metadata
IO.puts("MIME type: #{result.mime_type}")
IO.puts("All metadata keys: #{inspect(Map.keys(meta))}")

# PDF fields, read straight from the flat map.
page_count = meta["page_count"]

if page_count do
  IO.puts("Page count: #{page_count}")
end

authors = meta["authors"] || []

if authors != [] do
  IO.puts("Authors: #{Enum.join(authors, ", ")}")
end

title = meta["title"]

if title do
  IO.puts("Title: #{title}")
end

# HTML metadata uses the same flat layout.
{:ok, html_result} = Kreuzberg.extract_file("page.html")
html_meta = html_result.metadata

keywords = html_meta["keywords"] || []

if keywords != [] do
  IO.puts("Keywords: #{Enum.join(keywords, ", ")}")
end

description = html_meta["description"]

if description do
  IO.puts("Description: #{description}")
end
import { extractFileSync } from '@kreuzberg/node';

const result = extractFileSync('document.pdf');
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);

// PDF-specific fields sit at the top level of metadata.
if (result.metadata.pageCount) {
  console.log(`Pages: ${result.metadata.pageCount}`);
}

const htmlResult = extractFileSync('page.html');
console.log(`HTML Metadata: ${JSON.stringify(htmlResult.metadata)}`);

const htmlMeta = htmlResult.metadata;

if (htmlMeta.title) {
  console.log(`Title: ${htmlMeta.title}`);
}

// Keywords are a string array.
if (htmlMeta.keywords?.length) {
  console.log(`Keywords: ${htmlMeta.keywords.join(', ')}`);
}

// Canonical URL (renamed from `canonical`).
if (htmlMeta.canonicalUrl) {
  console.log(`Canonical URL: ${htmlMeta.canonicalUrl}`);
}

// Open Graph values live in a string-keyed map.
if (htmlMeta.openGraph?.['image']) {
  console.log(`Open Graph Image: ${htmlMeta.openGraph['image']}`);
}
if (htmlMeta.openGraph?.['title']) {
  console.log(`Open Graph Title: ${htmlMeta.openGraph['title']}`);
}
if (htmlMeta.openGraph?.['type']) {
  console.log(`Open Graph Type: ${htmlMeta.openGraph['type']}`);
}

// Twitter Card values likewise.
if (htmlMeta.twitterCard?.['card']) {
  console.log(`Twitter Card Type: ${htmlMeta.twitterCard['card']}`);
}
if (htmlMeta.twitterCard?.['creator']) {
  console.log(`Twitter Creator: ${htmlMeta.twitterCard['creator']}`);
}

// Document-level language information.
if (htmlMeta.language) {
  console.log(`Language: ${htmlMeta.language}`);
}
if (htmlMeta.textDirection) {
  console.log(`Text Direction: ${htmlMeta.textDirection}`);
}

// Headers carry their text content.
if (htmlMeta.headers?.length) {
  console.log(`Headers: ${htmlMeta.headers.map(h => h.text).join(', ')}`);
}

// Links: href plus anchor text.
htmlMeta.links?.forEach((link) => {
  console.log(`Link: ${link.href} (${link.text})`);
});

// Images: source URL only.
htmlMeta.images?.forEach((image) => {
  console.log(`Image: ${image.src}`);
});

// Structured data (e.g. JSON-LD) items.
if (htmlMeta.structuredData?.length) {
  console.log(`Structured data items: ${htmlMeta.structuredData.length}`);
}
import { extractFromFile, initWasm } from '@kreuzberg/wasm';

// The WASM module must be initialized once before any extraction call.
await initWasm();

const fileInput = document.getElementById('file') as HTMLInputElement;
const file = fileInput.files?.[0];

if (file) {
  const result = await extractFromFile(file);
  console.log(`Metadata: ${JSON.stringify(result.metadata)}`);

  // Common metadata fields sit at the top level.
  if (result.metadata.title) {
    console.log(`Title: ${result.metadata.title}`);
  }

  const metadata = result.metadata;

  // HTML documents expose a nested `html` metadata object here.
  if (metadata.html) {
    const htmlMeta = metadata.html;
    console.log(`HTML Title: ${htmlMeta.title}`);
    console.log(`Description: ${htmlMeta.description}`);

    // Keywords are a string array.
    if (htmlMeta.keywords?.length) {
      console.log(`Keywords: ${htmlMeta.keywords.join(', ')}`);
    }

    // Canonical URL
    if (htmlMeta.canonical_url) {
      console.log(`Canonical URL: ${htmlMeta.canonical_url}`);
    }

    // Open Graph values live in a string-keyed map.
    if (htmlMeta.open_graph?.['title']) {
      console.log(`OG Title: ${htmlMeta.open_graph['title']}`);
    }
    if (htmlMeta.open_graph?.['image']) {
      console.log(`OG Image: ${htmlMeta.open_graph['image']}`);
    }

    // Twitter Card values likewise.
    if (htmlMeta.twitter_card?.['card']) {
      console.log(`Twitter Card Type: ${htmlMeta.twitter_card['card']}`);
    }

    // Headers carry their text content.
    if (htmlMeta.headers?.length) {
      console.log(`Headers: ${htmlMeta.headers.map((h: any) => h.text).join(', ')}`);
    }

    // Links: href plus anchor text.
    htmlMeta.links?.forEach((link: any) => {
      console.log(`Link: ${link.href} (${link.text})`);
    });

    // Images: source URL only.
    htmlMeta.images?.forEach((image: any) => {
      console.log(`Image: ${image.src}`);
    });

    // Structured data (e.g. JSON-LD) items.
    if (htmlMeta.structured_data?.length) {
      console.log(`Structured data items: ${htmlMeta.structured_data.length}`);
    }
  }

  // PDF-specific fields sit at the top level of metadata.
  if (metadata.pageCount) {
    console.log(`Pages: ${metadata.pageCount}`);
  }
  if (metadata.authors?.length) {
    console.log(`Authors: ${metadata.authors.join(', ')}`);
  }
}
Extract and parse metadata using JSON output:
# Extract with metadata (the JSON format includes metadata automatically)
kreuzberg extract document.pdf --format json

# Save to file so the metadata can be parsed later
kreuzberg extract document.pdf --format json > result.json

# Print all metadata fields (requires jq)
cat result.json | jq '.metadata'

# Extract HTML metadata
kreuzberg extract page.html --format json | jq '.metadata'

# Select specific metadata fields with a jq object filter
kreuzberg extract document.pdf --format json | \
  jq '.metadata | {page_count, authors, title}'

# Process multiple files in one batch invocation
kreuzberg batch documents/*.pdf --format json > all_metadata.json
JSON Output Structure:
Kreuzberg extracts format-specific metadata for:
- PDF: page count, title, authors (list), creation date, modification date
- HTML: SEO tags, Open Graph, Twitter Card, structured data, headers, links, images
- Excel: sheet count, sheet names
- Email: from, to, CC, BCC, message ID, attachments
- PowerPoint: title, author, description, fonts
- Images: dimensions, format, EXIF data
- Archives: format, file count, file list, sizes
- XML: element count, unique elements
- Text/Markdown: word count, line count, headers, links
See Types Reference for complete metadata reference.
Extract Tables¶
Tables come back as both structured cells and Markdown. Kreuzberg extracts them from PDFs, spreadsheets, and HTML:
#include "kreuzberg.h"
#include <stdio.h>

/* Extract a spreadsheet and dump any detected tables as JSON. */
int main(void) {
    struct CExtractionResult *result = kreuzberg_extract_file_sync("spreadsheet.xlsx");

    /* A NULL result or success == false means extraction failed. */
    if (result == NULL || !result->success) {
        fprintf(stderr, "Error: %s\n", kreuzberg_get_error_details().message);
        return 1;
    }

    /* tables_json is NULL when the document contains no tables. */
    if (result->tables_json != NULL) {
        printf("Tables (JSON): %s\n", result->tables_json);
    } else {
        printf("No tables found\n");
    }

    /* Results are allocated by the library; release them when done. */
    kreuzberg_free_result(result);
    return 0;
}
using Kreuzberg;

// Synchronous extraction with default settings.
var result = KreuzbergClient.ExtractFileSync("document.pdf", new ExtractionConfig());

// Each table exposes structured cells plus a ready-made Markdown rendering.
foreach (var table in result.Tables)
{
    Console.WriteLine($"Table with {table.Cells.Count} rows");
    Console.WriteLine(table.Markdown);

    // Rows are lists of cell strings; join with a pipe for display.
    foreach (var row in table.Cells)
    {
        Console.WriteLine(string.Join(" | ", row));
    }
}
package main

import (
	"fmt"
	"log"

	"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

func main() {
	// Synchronous extraction; nil selects the default configuration.
	result, err := kreuzberg.ExtractFileSync("document.pdf", nil)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	// Iterate over tables
	for _, table := range result.Tables {
		fmt.Printf("Table with %d rows\n", len(table.Cells))
		fmt.Println(table.Markdown) // Markdown representation

		// Access cells: each row is a slice of cell strings
		for _, row := range table.Cells {
			fmt.Println(row)
		}
	}
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;
import dev.kreuzberg.KreuzbergException;
import dev.kreuzberg.Table;

import java.io.IOException;
import java.util.List;

public class Main {
    public static void main(String[] args) {
        try {
            // Synchronous extraction; the format is detected from the file.
            ExtractionResult result = Kreuzberg.extractFile("document.pdf");

            // Each table exposes structured cells plus a Markdown rendering.
            for (Table table : result.getTables()) {
                System.out.println("Table with " + table.cells().size() + " rows");
                System.out.println(table.markdown());

                // Rows are lists of cell strings.
                for (List<String> row : table.cells()) {
                    System.out.println(row);
                }
            }
        } catch (IOException | KreuzbergException e) {
            // IOException covers file access; KreuzbergException covers extraction errors.
            System.err.println("Extraction failed: " + e.getMessage());
        }
    }
}
from kreuzberg import extract_file_sync, ExtractionConfig, ExtractedTable

result = extract_file_sync("document.pdf", config=ExtractionConfig())

# Each table carries structured cells plus a ready-made Markdown rendering.
table: ExtractedTable
for table in result.tables:
    print(f"Table with {len(table.cells)} rows")
    print(table.markdown)
    for row in table.cells:
        print(row)
library(kreuzberg)

result <- extract_file_sync("spreadsheet.xlsx")

cat("Tables extracted:", length(result$tables), "\n\n")

# Each extracted table arrives as a data frame; report its shape and preview it.
for (i in seq_along(result$tables)) {
  table <- result$tables[[i]]
  cat(sprintf("Table %d:\n", i))
  cat(" Rows:", nrow(table), "\n")
  cat(" Columns:", ncol(table), "\n")
  cat(" Column names:", paste(colnames(table), collapse=", "), "\n")
  cat("\n")

  # Show the first few rows only when the table is non-empty.
  if (nrow(table) > 0L) {
    cat(" Preview (first 3 rows):\n")
    print(head(table, 3L))
    cat("\n")
  }
}
use kreuzberg::{extract_file_sync, ExtractionConfig};

fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig::default();
    let result = extract_file_sync("document.pdf", None, &config)?;

    // Each table carries structured cells plus a ready-made Markdown rendering.
    for table in &result.tables {
        println!("Table with {} rows", table.cells.len());
        println!("{}", table.markdown);

        // Rows are vectors of cell strings; Debug-print each one.
        for row in &table.cells {
            println!("{:?}", row);
        }
    }

    Ok(())
}
{:ok, result} = Kreuzberg.extract_file("document.pdf")

tables = result.tables
IO.puts("Total tables found: #{length(tables)}")

tables
|> Enum.with_index(1)
|> Enum.each(fn {table, index} ->
  IO.puts("\n--- Table #{index} ---")

  # "cells" is a list of rows, each a list of cell strings.
  IO.puts("Rows: #{length(table["cells"] || [])}")

  # Ready-made Markdown rendering of the table.
  IO.puts("Markdown representation:")
  IO.puts(table["markdown"])
end)
import { extractFromFile, initWasm } from '@kreuzberg/wasm';

// The WASM module must be initialized once before any extraction call.
await initWasm();

const fileInput = document.getElementById('file') as HTMLInputElement;
const file = fileInput.files?.[0];

if (file) {
  const { tables } = await extractFromFile(file);

  // Each table carries structured cells, its page number, and Markdown.
  tables.forEach((table) => {
    console.log(`Table with ${table.cells.length} rows`);
    console.log(`Page: ${table.pageNumber}`);
    console.log(table.markdown);
  });
}
Extract and process tables from documents:
# Extract with JSON format (includes tables when detected)
kreuzberg extract document.pdf --format json

# Save tables to JSON
kreuzberg extract spreadsheet.xlsx --format json > tables.json

# Extract and print each table's Markdown rendering (requires jq)
kreuzberg extract document.pdf --format json | \
  jq '.tables[]? | .markdown'

# Print each table's raw cell rows
kreuzberg extract document.pdf --format json | \
  jq '.tables[]? | .cells'

# Batch extract tables from multiple files
kreuzberg batch documents/**/*.pdf --format json > all_tables.json
JSON Table Structure: each entry in the `tables` array carries `cells` (rows of cell strings) and a `markdown` rendering — see the jq examples above.
Going Async¶
Use async extraction in web servers, background workers, or anywhere you need non-blocking I/O:
Not Applicable
The C FFI provides synchronous extraction only. Use kreuzberg_extract_file_sync
for file extraction. For concurrent extraction, call it from multiple
threads — the API is fully thread-safe.
package main

import (
	"context"
	"fmt"
	"log"
	"time"

	"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

func main() {
	// Bound the extraction with a 30-second timeout; cancel releases resources.
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	// Context-aware extraction; nil selects the default configuration.
	result, err := kreuzberg.ExtractFile(ctx, "document.pdf", nil)
	if err != nil {
		log.Fatalf("extract failed: %v", err)
	}

	fmt.Println(result.Content)
}
import dev.kreuzberg.Kreuzberg;
import dev.kreuzberg.ExtractionResult;

import java.nio.file.Path;
import java.util.concurrent.CompletableFuture;

public class Example {
    public static void main(String[] args) {
        // Asynchronous extraction; null selects the default configuration.
        CompletableFuture<ExtractionResult> future =
            Kreuzberg.extractFileAsync(Path.of("document.pdf"), null);

        // Attach a completion callback, then block until it has run.
        future.thenAccept(result -> {
            System.out.println(result.getContent());
            System.out.println("Tables: " + result.getTables().size());
        }).join();
    }
}
import asyncio

from kreuzberg import ExtractionConfig, extract_file


async def main() -> None:
    # Async extraction with a default configuration; awaits non-blocking I/O.
    result = await extract_file("document.pdf", config=ExtractionConfig())
    print(f"Content length: {len(result.content)} characters")
    print(f"Tables: {len(result.tables)}")


asyncio.run(main())
library(kreuzberg)

# Note: extract_file() blocks in R despite being async
result <- extract_file("path/to/document.docx")

# Access extraction results
cat("Extracted", length(result$elements), "elements\n")
cat("Detected language:", result$detected_language, "\n")
cat("Tables found:", length(result$tables), "\n")

# Keywords are present only when keyword extraction produced any.
if (!is.null(result$keywords)) {
  cat("Keywords:", paste(result$keywords, collapse = ", "), "\n")
}
# Kick off extraction in a background Task, then await its result.
task = Kreuzberg.extract_file_async("document.pdf")
{:ok, result} = Task.await(task)

IO.puts("Content length: #{byte_size(result.content)} characters")
IO.puts("Tables: #{length(result.tables)}")
IO.puts("Metadata keys: #{inspect(Map.keys(result.metadata))}")
import { extractFromFile, initWasm } from '@kreuzberg/wasm';

// The WASM module must be initialized once before any extraction call.
await initWasm();

const fileInput = document.getElementById('file') as HTMLInputElement;
const file = fileInput.files?.[0];

if (file) {
  const { content, tables } = await extractFromFile(file);
  console.log(`Content length: ${content.length} characters`);
  console.log(`Tables: ${tables.length}`);
}
Not Applicable
Async extraction is an API-level feature. The CLI operates synchronously. Use language-specific bindings (Python, TypeScript, Rust, WASM) for async operations.
Next Steps¶
You've covered the core API. Go deeper:
- Configuration Guide — OCR backends, chunking, language detection, config files
- Extract from Bytes — Process in-memory data without writing to disk
- OCR Setup — Tesseract, PaddleOCR, EasyOCR backends
- Types Reference — Full metadata fields for every format
- Docker Deployment — Run Kreuzberg in containers
- API Reference — Complete API documentation