Keyword Extraction¶
Extract ranked keywords from document text using YAKE or RAKE algorithms.
| Algorithm | Scoring | Best for |
|---|---|---|
| YAKE | Lower score = more relevant (0.0–1.0) | General documents, single terms, multilingual |
| RAKE | Higher score = more relevant (unbounded) | Multi-word phrases, technical docs |
Quick Start¶
Python
import asyncio
from kreuzberg import extract_file, ExtractionConfig, KeywordConfig, KeywordAlgorithm
async def main() -> None:
config: ExtractionConfig = ExtractionConfig(
keywords=KeywordConfig(
algorithm=KeywordAlgorithm.YAKE,
max_keywords=10,
min_score=0.3
)
)
result = await extract_file("research_paper.pdf", config=config)
keywords: list = result.extracted_keywords or []
for kw in keywords:
score: float = kw.score or 0.0
text: str = kw.text or ""
print(f"{text}: {score:.3f}")
asyncio.run(main())
TypeScript
import { extractFile } from '@kreuzberg/node';
const config = {
keywords: {
algorithm: 'yake',
maxKeywords: 10,
minScore: 0.3,
},
};
const result = await extractFile('research_paper.pdf', null, config);
console.log(`Content length: ${result.content.length}`);
console.log(`Metadata: ${JSON.stringify(result.metadata)}`);
Rust
use kreuzberg::{extract_file, ExtractionConfig, KeywordConfig, KeywordAlgorithm};
let config = ExtractionConfig {
keywords: Some(KeywordConfig {
algorithm: KeywordAlgorithm::Yake,
max_keywords: 10,
min_score: 0.3,
..Default::default()
}),
..Default::default()
};
let result = extract_file("research_paper.pdf", None, &config).await?;
if let Some(keywords) = &result.extracted_keywords {
println!("Keywords: {:?}", keywords);
}
Go
package main
import (
"fmt"
"log"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
config := &kreuzberg.ExtractionConfig{
Keywords: &kreuzberg.KeywordConfig{
Algorithm: "YAKE",
MaxKeywords: 10,
MinScore: 0.3,
},
}
result, err := kreuzberg.ExtractFileSync("research_paper.pdf", config)
if err != nil {
log.Fatalf("extract failed: %v", err)
}
if keywords, ok := result.Metadata.Additional["keywords"]; ok {
fmt.Printf("Keywords: %v\n", keywords)
}
}
C#
using Kreuzberg;
using System.Collections.Generic;
var config = new ExtractionConfig
{
Keywords = new KeywordConfig
{
Algorithm = KeywordAlgorithm.Yake,
MaxKeywords = 10,
MinScore = 0.3
}
};
var result = await KreuzbergClient.ExtractFileAsync(
"research_paper.pdf",
config
);
if (result.Metadata.ContainsKey("keywords"))
{
var keywords = (List<Dictionary<string, object>>)result.Metadata["keywords"];
foreach (var kw in keywords)
{
var text = (string)kw["text"];
var score = (double)kw["score"];
Console.WriteLine($"{text}: {score:F3}");
}
}
Ruby
require 'kreuzberg'
config = Kreuzberg::Config::Extraction.new(
keywords: Kreuzberg::Config::Keywords.new(
algorithm: Kreuzberg::KeywordAlgorithm::YAKE,
max_keywords: 10,
min_score: 0.3
)
)
result = Kreuzberg.extract_file_sync('research_paper.pdf', config: config)
keywords = result.extracted_keywords
keywords.each do |kw|
puts "#{kw['text']}: #{kw['score'].round(3)}"
end
Keywords are returned in result.extracted_keywords as objects with text and score fields.
Configuration¶
| Parameter | Type | Default | Description |
|---|---|---|---|
| `algorithm` | `KeywordAlgorithm` | `YAKE` | `YAKE` or `RAKE` |
| `max_keywords` | `int` | `10` | Maximum keywords to extract |
| `min_score` | `float` | `0.0` | Score threshold (upper bound for YAKE, lower bound for RAKE) |
| `ngram_range` | `tuple[int, int]` | `(1, 3)` | Min and max phrase length in words |
| `language` | `str \| None` | `"en"` | Language for stopword filtering (`None` disables) |
| `yake_params` | `YakeParams` | — | YAKE-specific tuning |
| `rake_params` | `RakeParams` | — | RAKE-specific tuning |
Python
import asyncio
from kreuzberg import (
ExtractionConfig,
KeywordConfig,
KeywordAlgorithm,
extract_file,
)
async def main() -> None:
config: ExtractionConfig = ExtractionConfig(
keywords=KeywordConfig(
algorithm=KeywordAlgorithm.YAKE,
max_keywords=10,
min_score=0.3,
ngram_range=(1, 3),
language="en"
)
)
result = await extract_file("document.pdf", config=config)
print(f"Content extracted: {len(result.content)} chars")
asyncio.run(main())
Rust
use kreuzberg::{extract_file, ExtractionConfig};
use kreuzberg::keywords::{KeywordConfig, KeywordAlgorithm};
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
let config = ExtractionConfig {
keywords: Some(KeywordConfig {
algorithm: KeywordAlgorithm::Yake,
max_keywords: 10,
min_score: 0.1,
ngram_range: (1, 3),
language: Some("en".to_string()),
..Default::default()
}),
..Default::default()
};
let result = extract_file("document.pdf", None::<&str>, &config).await?;
println!("Keywords: {:?}", result.keywords);
Ok(())
}
Go
package main
import (
"fmt"
"github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)
func main() {
config := &kreuzberg.ExtractionConfig{
Keywords: &kreuzberg.KeywordConfig{
Algorithm: "YAKE",
MaxKeywords: 10,
MinScore: 0.3,
NgramRange: "1,3",
Language: "en",
},
}
fmt.Printf("Keywords config: Algorithm=%s, MaxKeywords=%d, MinScore=%f\n",
config.Keywords.Algorithm,
config.Keywords.MaxKeywords,
config.Keywords.MinScore)
}
R
library(kreuzberg)
config <- extraction_config(
keywords = list(enabled = TRUE)
)
result <- extract_file_sync("document.pdf", "application/pdf", config)
cat(sprintf("Extracted %d keywords\n", length(result$keywords)))
if (length(result$keywords) > 0) {
for (i in seq_len(min(5L, length(result$keywords)))) {
cat(sprintf(" - %s\n", result$keywords[[i]]))
}
}
YAKE Score Tuning¶
Lower YAKE scores mean higher relevance, so `min_score` acts as an upper bound: keywords scoring above it are discarded.
| `min_score` | Effect |
|---|---|
| 0.5 | Keeps most keywords |
| 0.3 | Main topics only |
| 0.1 | Core concepts only |
YakeParams.window_size controls co-occurrence context: 1–2 for narrow domains, 2–3 for general (default: 2), 3–4 for discussion-heavy content.
RAKE Score Tuning¶
Higher RAKE scores mean higher relevance, so `min_score` acts as a lower bound: keywords scoring below it are discarded.
| `min_score` | Effect |
|---|---|
| 0.1 | Keeps most keywords |
| 5.0 | Main phrases only |
| 20.0 | Only highly specific phrases |
RakeParams: min_word_length (default: 1), max_words_per_phrase (default: 3).
Troubleshooting¶
- **Too few keywords** — Lower `min_score` (for YAKE, where it is an upper bound, raise it instead), check `result.content` is non-empty, set `language` to match the document or `None` to disable stopword filtering
- **Too many irrelevant keywords** — Raise `min_score` (for YAKE, where it is an upper bound, lower it instead), set `language` for stopword filtering, reduce the `ngram_range` upper bound
- **Multi-word phrases missing (YAKE)** — Switch to RAKE or confirm the `ngram_range` upper bound is >= 2
- **Keywords don't match content** — Verify text was extracted (`result.content`) and `language` matches the document
See the KeywordConfig reference for the full parameter list.