Skip to content

Keyword Extraction

Extract ranked keywords from document text using either the YAKE or the RAKE algorithm.

Algorithm Scoring Best for
YAKE Lower score = more relevant (0.0–1.0) General documents, single terms, multilingual
RAKE Higher score = more relevant (unbounded) Multi-word phrases, technical docs

Quick Start

Python
import asyncio
from kreuzberg import extract_file, ExtractionConfig, KeywordConfig, KeywordAlgorithm

async def main() -> None:
    """Extract YAKE keywords from a PDF and print each keyword with its score."""
    keyword_settings = KeywordConfig(
        algorithm=KeywordAlgorithm.YAKE,
        max_keywords=10,
        min_score=0.3,
    )
    config: ExtractionConfig = ExtractionConfig(keywords=keyword_settings)
    result = await extract_file("research_paper.pdf", config=config)

    # Fall back to an empty list when the result carries no keywords.
    keywords: list = result.extracted_keywords or []
    for entry in keywords:
        # Missing score/text values are printed as 0.0 and an empty string.
        weight: float = entry.score or 0.0
        label: str = entry.text or ""
        print(f"{label}: {weight:.3f}")

asyncio.run(main())
TypeScript
import { extractFile } from '@kreuzberg/node';

// Keyword settings: YAKE algorithm, at most ten keywords, score threshold 0.3.
const keywordOptions = {
    algorithm: 'yake',
    maxKeywords: 10,
    minScore: 0.3,
};
const config = { keywords: keywordOptions };

// Run the extraction with the keyword configuration attached.
const result = await extractFile('research_paper.pdf', null, config);

// Report the size of the extracted text and dump the metadata as JSON.
const { content, metadata } = result;
console.log(`Content length: ${content.length}`);
console.log(`Metadata: ${JSON.stringify(metadata)}`);
Rust
use kreuzberg::{extract_file, ExtractionConfig, KeywordConfig, KeywordAlgorithm};

// YAKE settings: at most ten keywords, score threshold 0.3; remaining fields use defaults.
let keyword_settings = KeywordConfig {
    algorithm: KeywordAlgorithm::Yake,
    max_keywords: 10,
    min_score: 0.3,
    ..Default::default()
};
let config = ExtractionConfig {
    keywords: Some(keyword_settings),
    ..Default::default()
};

let result = extract_file("research_paper.pdf", None, &config).await?;

// Print the keyword list only when the result actually carries one.
if let Some(found) = result.extracted_keywords.as_ref() {
    println!("Keywords: {:?}", found);
}
Go
package main

import (
    "fmt"
    "log"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

// main extracts YAKE keywords from a PDF and prints them when the result's
// additional metadata contains a "keywords" entry.
func main() {
    keywordCfg := &kreuzberg.KeywordConfig{
        Algorithm:   "YAKE",
        MaxKeywords: 10,
        MinScore:    0.3,
    }
    config := &kreuzberg.ExtractionConfig{Keywords: keywordCfg}

    result, err := kreuzberg.ExtractFileSync("research_paper.pdf", config)
    if err != nil {
        log.Fatalf("extract failed: %v", err)
    }

    // Nothing to print when the "keywords" key is absent.
    keywords, found := result.Metadata.Additional["keywords"]
    if !found {
        return
    }
    fmt.Printf("Keywords: %v\n", keywords)
}
Java
// Note: Keyword extraction is not yet available in Java bindings
// This feature requires the 'keywords' feature flag and is planned for a future release
C#
using Kreuzberg;
using System.Collections.Generic;

// YAKE keyword settings: at most ten keywords, score threshold 0.3.
var keywordSettings = new KeywordConfig
{
    Algorithm = KeywordAlgorithm.Yake,
    MaxKeywords = 10,
    MinScore = 0.3
};
var config = new ExtractionConfig { Keywords = keywordSettings };

var result = await KreuzbergClient.ExtractFileAsync("research_paper.pdf", config);

// Keywords are read from the "keywords" metadata entry, cast to a list of
// dictionaries holding "text" and "score" values.
if (result.Metadata.ContainsKey("keywords"))
{
    var entries = (List<Dictionary<string, object>>)result.Metadata["keywords"];
    foreach (var entry in entries)
    {
        var text = (string)entry["text"];
        var score = (double)entry["score"];
        Console.WriteLine($"{text}: {score:F3}");
    }
}
Ruby
require 'kreuzberg'

# Configure YAKE keyword extraction: at most ten keywords, score threshold 0.3.
config = Kreuzberg::Config::Extraction.new(
  keywords: Kreuzberg::Config::Keywords.new(
    algorithm: Kreuzberg::KeywordAlgorithm::YAKE,
    max_keywords: 10,
    min_score: 0.3
  )
)

result = Kreuzberg.extract_file_sync('research_paper.pdf', config: config)

# NOTE(review): extracted_keywords is presumably nil when no keywords are
# available (confirm against the binding); fall back to an empty list so the
# loop never calls #each on nil.
keywords = result.extracted_keywords || []
keywords.each do |kw|
  puts "#{kw['text']}: #{kw['score'].round(3)}"
end

Keywords are returned in result.extracted_keywords as objects with text and score fields.

Configuration

Parameter Type Default Description
algorithm KeywordAlgorithm YAKE YAKE or RAKE
max_keywords int 10 Maximum keywords to extract
min_score float 0.0 Score threshold (upper bound for YAKE, lower bound for RAKE)
ngram_range tuple[int, int] (1, 3) Min and max phrase length in words
language str | None "en" Language for stopword filtering (None disables)
yake_params YakeParams YAKE-specific tuning
rake_params RakeParams RAKE-specific tuning
Python
import asyncio
from kreuzberg import (
    ExtractionConfig,
    KeywordConfig,
    KeywordAlgorithm,
    extract_file,
)

async def main() -> None:
    """Run an extraction with every common keyword option set and print the content size."""
    keyword_settings = KeywordConfig(
        algorithm=KeywordAlgorithm.YAKE,
        max_keywords=10,
        min_score=0.3,
        ngram_range=(1, 3),
        language="en",
    )
    config: ExtractionConfig = ExtractionConfig(keywords=keyword_settings)
    result = await extract_file("document.pdf", config=config)

    # Report how many characters of text were extracted.
    char_count = len(result.content)
    print(f"Content extracted: {char_count} chars")

asyncio.run(main())
TypeScript
import { extractFile } from '@kreuzberg/node';

// Keyword settings: YAKE, at most ten keywords, score threshold 0.3,
// phrases of one to three words, English language.
const keywordOptions = {
    algorithm: 'yake',
    maxKeywords: 10,
    minScore: 0.3,
    ngramRange: [1, 3],
    language: 'en',
};
const config = { keywords: keywordOptions };

// Run the extraction and print the extracted text.
const result = await extractFile('document.pdf', null, config);
const { content } = result;
console.log(`Content: ${content}`);
Rust
use kreuzberg::{extract_file, ExtractionConfig};
use kreuzberg::keywords::{KeywordConfig, KeywordAlgorithm};

/// Runs an extraction with an explicit keyword configuration and prints the
/// keywords attached to the result.
#[tokio::main]
async fn main() -> kreuzberg::Result<()> {
    let config = ExtractionConfig {
        keywords: Some(KeywordConfig {
            algorithm: KeywordAlgorithm::Yake,
            max_keywords: 10,
            // Same threshold as the equivalent examples in the other languages.
            min_score: 0.3,
            ngram_range: (1, 3),
            language: Some("en".to_string()),
            ..Default::default()
        }),
        ..Default::default()
    };

    let result = extract_file("document.pdf", None::<&str>, &config).await?;
    // Keywords are exposed on the result as `extracted_keywords`.
    println!("Keywords: {:?}", result.extracted_keywords);
    Ok(())
}
Go
package main

import (
    "fmt"

    "github.com/kreuzberg-dev/kreuzberg/packages/go/v4"
)

// main builds a keyword configuration and prints a summary of its settings.
func main() {
    keywordCfg := &kreuzberg.KeywordConfig{
        Algorithm:   "YAKE",
        MaxKeywords: 10,
        MinScore:    0.3,
        NgramRange:  "1,3",
        Language:    "en",
    }
    config := &kreuzberg.ExtractionConfig{Keywords: keywordCfg}

    // Read the values back through the extraction config to show how they are nested.
    settings := config.Keywords
    fmt.Printf("Keywords config: Algorithm=%s, MaxKeywords=%d, MinScore=%f\n",
        settings.Algorithm, settings.MaxKeywords, settings.MinScore)
}
Ruby
require 'kreuzberg'

# Keyword settings: YAKE, at most ten keywords, score threshold 0.3,
# phrases of one to three words, English language.
keyword_settings = Kreuzberg::Config::Keywords.new(
  algorithm: Kreuzberg::KeywordAlgorithm::YAKE,
  max_keywords: 10,
  min_score: 0.3,
  ngram_range: [1, 3],
  language: 'en'
)

# Attach the keyword settings to the extraction configuration.
config = Kreuzberg::Config::Extraction.new(keywords: keyword_settings)
R
library(kreuzberg)

# Turn keyword extraction on via the `enabled` flag.
config <- extraction_config(keywords = list(enabled = TRUE))

result <- extract_file_sync("document.pdf", "application/pdf", config)

keywords <- result$keywords
keyword_count <- length(keywords)
cat(sprintf("Extracted %d keywords\n", keyword_count))

# List at most the first five keywords; seq_len(0) is empty, so nothing is
# printed when no keywords were returned.
shown <- min(5L, keyword_count)
for (idx in seq_len(shown)) {
  cat(sprintf("  - %s\n", keywords[[idx]]))
}
C#
using Kreuzberg;

// Keyword settings: YAKE, at most ten keywords, score threshold 0.3,
// phrases of one to three words, English language.
var keywordSettings = new KeywordConfig
{
    Algorithm = KeywordAlgorithm.Yake,
    MaxKeywords = 10,
    MinScore = 0.3,
    NgramRange = (1, 3),
    Language = "en"
};

// Attach the keyword settings to the extraction configuration.
var config = new ExtractionConfig { Keywords = keywordSettings };

YAKE Score Tuning

Lower YAKE scores = higher relevance. Use min_score as an upper bound:

min_score Effect
0.5 Keeps most keywords
0.3 Main topics only
0.1 Core concepts only

YakeParams.window_size controls co-occurrence context: 1–2 for narrow domains, 2–3 for general (default: 2), 3–4 for discussion-heavy content.

RAKE Score Tuning

Higher RAKE scores = higher relevance. Use min_score as a lower bound:

min_score Effect
0.1 Keeps most keywords
5.0 Main phrases only
20.0 Only highly specific phrases

RakeParams: min_word_length (default: 1), max_words_per_phrase (default: 3).

Troubleshooting

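  • Too few keywords — Loosen the min_score threshold (raise it for YAKE, where it is an upper bound; lower it for RAKE, where it is a lower bound), check result.content is non-empty, set language to match the document or None to disable stopword filtering
  • Too many irrelevant keywords — Tighten the min_score threshold (lower it for YAKE, raise it for RAKE), set language for stopword filtering, reduce ngram_range upper bound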
  • Multi-word phrases missing (YAKE) — Switch to RAKE or confirm ngram_range upper bound is >= 2
  • Keywords don't match content — Verify text was extracted (result.content) and language matches the document

See the KeywordConfig reference for the full parameter list.