Bentley's coding challenge: k most frequent words

C++ (a la Knuth)

I was curious how Knuth's program would fare, so I translated his (originally Pascal) program into C++.

Even though Knuth's primary goal was not speed but to illustrate his WEB system of literate programming, the program is surprisingly competitive, and leads to a faster solution than any of the answers here so far. Here's my translation of his program (the corresponding "section" numbers of the WEB program are mentioned in comments like "{§24}"):

#include <iostream>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <string>

// Adjust these parameters based on input size.
const int TRIE_SIZE = 800 * 1000; // Size of the hash table used for the trie.
const int ALPHA = 494441;  // An integer that's approximately (0.61803 * TRIE_SIZE), and relatively prime to T = TRIE_SIZE - 52.
const int kTolerance = TRIE_SIZE / 100;  // How many places to try, to find a new place for a "family" (=bunch of children).

typedef int32_t Pointer;  // [0..TRIE_SIZE), an index into the array of Nodes
typedef int8_t Char;  // We only care about 1..26 (plus two values), but there's no "int5_t".
typedef int32_t Count;  // The number of times a word has been encountered.
// These are 4 separate arrays in Knuth's implementation.
struct Node {
  Pointer link;  // From a parent node to its children's "header", or from a header back to parent.
  Pointer sibling;  // Previous sibling, cyclically. (From smallest child to header, and header to largest child.)
  Count count;  // The number of times this word has been encountered.
  Char ch;  // EMPTY, or 1..26, or HEADER. (For nodes with ch=EMPTY, the link/sibling/count fields mean nothing.)
} node[TRIE_SIZE + 1];
// Special values for `ch`: EMPTY (free, can insert child there) and HEADER (start of family).
const Char EMPTY = 0, HEADER = 27;

const Pointer T = TRIE_SIZE - 52;
Pointer x;  // The `n`th time we need a node, we'll start trying at x_n = (alpha * n) mod T. This holds current `x_n`.
// A header can only be at one of T (= TRIE_SIZE - 52) positions, namely [27..TRIE_SIZE - 26].
// This maps an `h` from the range [0..T) into that range, i.e. [27..T + 27).
Pointer rerange(Pointer n) {
  n = (n % T) + 27;
  // assert(27 <= n && n <= TRIE_SIZE - 26);
  return n;
}

// Convert trie node to string, by walking up the trie.
std::string word_for(Pointer p) {
  std::string word;
  while (p != 0) {
    Char c = node[p].ch;  // assert(1 <= c && c <= 26);
    word = static_cast<char>('a' - 1 + c) + word;
    // assert(node[p - c].ch == HEADER);
    p = (p - c) ? node[p - c].link : 0;
  }
  return word;
}

// Increment `x`, and declare `h` (the first position to try) and `last_h` (the last position to try). {§24}
#define PREPARE_X_H_LAST_H x = (x + ALPHA) % T; Pointer h = rerange(x); Pointer last_h = rerange(x + kTolerance);
// Increment `h`, being careful to account for `last_h` and wraparound. {§25}
#define INCR_H { if (h == last_h) { std::cerr << "Hit tolerance limit unfortunately" << std::endl; exit(1); } h = (h == TRIE_SIZE - 26) ? 27 : h + 1; }

// `p` has no children. Create `p`s family of children, with only child `c`. {§27}
Pointer create_child(Pointer p, int8_t c) {
  // Find `h` such that there's room for both header and child c.
  PREPARE_X_H_LAST_H;
  while (!(node[h].ch == EMPTY and node[h + c].ch == EMPTY)) INCR_H;
  // Now create the family, with header at h and child at h + c.
  node[h]     = {.link = p, .sibling = h + c, .count = 0, .ch = HEADER};
  node[h + c] = {.link = 0, .sibling = h,     .count = 0, .ch = c};
  node[p].link = h;
  return h + c;
}

// Move `p`'s family of children to a place where child `c` will also fit. {§29}
void move_family_for(const Pointer p, Char c) {
  // Part 1: Find such a place: need room for `c` and also all existing children. {§31}
  PREPARE_X_H_LAST_H;
  while (true) {
    INCR_H;
    if (node[h + c].ch != EMPTY) continue;
    Pointer r = node[p].link;
    int delta = h - r;  // We'd like to move each child by `delta`
    while (node[r + delta].ch == EMPTY and node[r].sibling != node[p].link) {
      r = node[r].sibling;
    }
    if (node[r + delta].ch == EMPTY) break;  // There's now space for everyone.
  }

  // Part 2: Now actually move the whole family to start at the new `h`.
  Pointer r = node[p].link;
  int delta = h - r;
  do {
    Pointer sibling = node[r].sibling;
    // Move node from current position (r) to new position (r + delta), and free up old position (r).
    node[r + delta] = {.link = node[r].link, .sibling = node[r].sibling + delta, .count = node[r].count, .ch = node[r].ch};
    if (node[r].link != 0) node[node[r].link].link = r + delta;
    node[r].ch = EMPTY;
    r = sibling;
  } while (node[r].ch != EMPTY);
}

// Advance `p` to its `c`th child. If necessary, add the child, or even move `p`'s family. {§21}
Pointer find_child(Pointer p, Char c) {
  // assert(1 <= c && c <= 26);
  if (p == 0) return c;  // Special case for first char.
  if (node[p].link == 0) return create_child(p, c);  // If `p` currently has *no* children.
  Pointer q = node[p].link + c;
  if (node[q].ch == c) return q;  // Easiest case: `p` already has a `c`th child.
  // Make sure we have room to insert a `c`th child for `p`, by moving its family if necessary.
  if (node[q].ch != EMPTY) {
    move_family_for(p, c);
    q = node[p].link + c;
  }
  // Insert child `c` into `p`'s family of children (at `q`), with correct siblings. {§28}
  Pointer h = node[p].link;
  while (node[h].sibling > q) h = node[h].sibling;
  node[q] = {.link = 0, .sibling = node[h].sibling, .count = 0, .ch = c};
  node[h].sibling = q;
  return q;
}

// Largest descendant. {§18}
Pointer last_suffix(Pointer p) {
  while (node[p].link != 0) p = node[node[p].link].sibling;
  return p;
}

// The largest count beyond which we'll put all words in the same (last) bucket.
// We do an insertion sort (potentially slow) in the last bucket, so increase this if the program takes a long time to walk the trie.
const int MAX_BUCKET = 10000;
Pointer sorted[MAX_BUCKET + 1];  // The head of each list.

// Records the count `n` of `p`, by inserting `p` in the list that starts at `sorted[n]`.
// Overwrites the value of node[p].sibling (uses the field to mean its successor in the `sorted` list).
void record_count(Pointer p) {
  // assert(node[p].ch != HEADER);
  // assert(node[p].ch != EMPTY);
  Count f = node[p].count;
  if (f == 0) return;
  if (f < MAX_BUCKET) {
    // Insert at head of list.
    node[p].sibling = sorted[f];
    sorted[f] = p;
  } else {
    Pointer r = sorted[MAX_BUCKET];
    if (node[p].count >= node[r].count) {
      // Insert at head of list
      node[p].sibling = r;
      sorted[MAX_BUCKET] = p;
    } else {
      // Find right place by count. This step can be SLOW if there are too many words with count >= MAX_BUCKET
      while (node[p].count < node[node[r].sibling].count) r = node[r].sibling;
      node[p].sibling = node[r].sibling;
      node[r].sibling = p;
    }
  }
}

// Walk the trie, going over all words in reverse-alphabetical order. {§37}
// Calls "record_count" for each word found.
void walk_trie() {
  // assert(node[0].ch == HEADER);
  Pointer p = node[0].sibling;
  while (p != 0) {
    Pointer q = node[p].sibling;  // Saving this, as `record_count(p)` will overwrite it.
    record_count(p);
    // Move down to last descendant of `q` if any, else up to parent of `q`.
    p = (node[q].ch == HEADER) ? node[q].link : last_suffix(q);
  }
}

int main(int, char** argv) {
  // Program startup
  std::ios::sync_with_stdio(false);

  // Set initial values {§19}
  for (Char i = 1; i <= 26; ++i) node[i] = {.link = 0, .sibling = i - 1, .count = 0, .ch = i};
  node[0] = {.link = 0, .sibling = 26, .count = 0, .ch = HEADER};

  // read in file contents
  FILE *fptr = fopen(argv[1], "rb");
  if (!fptr) { std::cerr << "Could not open file: " << argv[1] << std::endl; return 1; }
  fseek(fptr, 0L, SEEK_END);
  long dataLength = ftell(fptr);
  rewind(fptr);
  char* data = (char*)malloc(dataLength);
  fread(data, 1, dataLength, fptr);
  fclose(fptr);

  // Loop over file contents: the bulk of the time is spent here.
  Pointer p = 0;
  for (long i = 0; i < dataLength; ++i) {
    Char c = (data[i] | 32) - 'a' + 1;  // 1 to 26, for 'a' to 'z' or 'A' to 'Z'
    if (1 <= c && c <= 26) {
      p = find_child(p, c);
    } else {
      ++node[p].count;
      p = 0;
    }
  }
  ++node[p].count;  // Count the last word, in case the file doesn't end with a delimiter.
  node[0].count = 0;

  walk_trie();

  const int max_words_to_print = atoi(argv[2]);
  int num_printed = 0;
  for (Count f = MAX_BUCKET; f >= 0 && num_printed < max_words_to_print; --f) {
    for (Pointer p = sorted[f]; p != 0 && num_printed < max_words_to_print; p = node[p].sibling) {
      std::cout << word_for(p) << " " << node[p].count << std::endl;
      ++num_printed;
    }
  }

  return 0;
}

Differences from Knuth's program:

  • I combined Knuth's 4 arrays link, sibling, count and ch into an array of a struct Node (I find it easier to understand this way).
  • I changed the literate-programming (WEB-style) textual transclusion of sections into more conventional function calls (and a couple of macros).
  • We don't need to use standard Pascal's weird I/O conventions/restrictions, so this version uses fread and (data[i] | 32) - 'a' as in the other answers here, instead of the Pascal workaround. (See the small demo after this list.)
  • In case we exceed limits (run out of space) while the program is running, Knuth's original program deals with it gracefully by dropping later words and printing a message at the end. (It's not quite right to say that McIlroy "criticized Knuth's solution as not even able to process a full text of the Bible"; he was only pointing out that frequent words can occur very late in a text, such as the word "Jesus" in the Bible, so the error condition is not innocuous.) I've taken the noisier (and anyway easier) approach of simply terminating the program.
  • The program declares a constant TRIE_SIZE to control the memory usage, which I bumped up. (The constant of 32767 had been chosen for the original requirements -- "a user should be able to find the 100 most frequent words in a twenty-page technical paper (roughly a 50K byte file)" -- and because Pascal deals well with ranged integer types and packs them optimally. We had to increase it 25x, to 800,000, as the test input (the ~1 GB giganovel) is now some 20,000 times larger.)
  • For the final printing of strings, we can just walk the trie and do a dumb (possibly even quadratic) string append.
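As an aside on the (data[i] | 32) - 'a' byte classification mentioned above: setting bit 5 (value 32) maps ASCII 'A'..'Z' onto 'a'..'z' while leaving lowercase letters unchanged, and every non-letter byte lands outside 1..26, so it acts as a word delimiter. A minimal standalone check (plain ASCII assumed):

#include <cassert>

int main() {
  // Setting bit 5 (value 32) maps 'A'..'Z' onto 'a'..'z'; lowercase is unchanged.
  assert(('A' | 32) == 'a');
  assert(('Z' | 32) == 'z');
  assert(('q' | 32) == 'q');
  // Non-letter bytes land outside the range 1..26, so they end the current word.
  assert((' ' | 32) - 'a' + 1 < 1);
  assert(('{' | 32) - 'a' + 1 > 26);
  return 0;
}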

Apart from that, this is pretty much exactly Knuth's program (using his hash trie / packed trie data structure and bucket sort), and it does pretty much the same operations as Knuth's Pascal program would while looping through all characters in the input. Note that it uses no external algorithm or data-structure libraries, and that words of equal frequency are printed in alphabetical order.

Timing

Compiled with

clang++ -std=c++17 -O2 ptrie-walktrie.cc 

When run on the largest testcase here (giganovel with 100,000 words requested), and compared against the fastest program posted here so far, I find it slightly but consistently faster:

target/release/frequent:   4.809 ±   0.263 [ 4.45.. 5.62]        [... 4.63 ...  4.75 ...  4.88...]
ptrie-walktrie:            4.547 ±   0.164 [ 4.35.. 4.99]        [... 4.42 ...   4.5 ...  4.68...]

(The top line is Anders Kaseorg's Rust solution; the bottom is the above program. These are timings from 100 runs, with mean, min, max, median, and quartiles.)

Analysis

Why is this faster? It is not that C++ is faster than Rust, or that Knuth's program is the fastest possible -- in fact, Knuth's program is slower on insertions (as he mentions) because of the trie-packing (to conserve memory). The reason, I suspect, is related to something that Knuth complained about in 2008:

A Flame About 64-bit Pointers

It is absolutely idiotic to have 64-bit pointers when I compile a program that uses less than 4 gigabytes of RAM. When such pointer values appear inside a struct, they not only waste half the memory, they effectively throw away half of the cache.

The program above uses 32-bit array indices (not 64-bit pointers), so the Node struct occupies less memory, more Nodes fit in the cache, and there are fewer cache misses. (In fact, there was some work on this as the x32 ABI, but it seems to be not in a good state even though the idea is obviously useful; see e.g. the recent announcement of pointer compression in V8. Oh well.) On giganovel, this program uses 12.8 MB for the (packed) trie, versus the Rust program's 32.18 MB for its trie. We could scale up 1000x (from "giganovel" to "teranovel", say) and still not exceed 32-bit indices, so this seems a reasonable choice.
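To make the size argument concrete, here's a small sketch. The pointer-based layout is a hypothetical stand-in (not the Rust program's actual node type), just to show what 64-bit fields do to struct size:

#include <cstdint>
#include <cstdio>

struct NodeWithPointers {  // hypothetical 64-bit-pointer layout
  NodeWithPointers* link;
  NodeWithPointers* sibling;
  int32_t count;
  int8_t ch;
};  // 8 + 8 + 4 + 1 bytes, padded to 24 on a typical 64-bit ABI

struct NodeWithIndices {   // the 32-bit-index layout used above
  int32_t link;
  int32_t sibling;
  int32_t count;
  int8_t ch;
};  // 4 + 4 + 4 + 1 bytes, padded to 16

int main() {
  printf("with pointers: %zu bytes, with indices: %zu bytes\n",
         sizeof(NodeWithPointers), sizeof(NodeWithIndices));
  return 0;
}

Smaller nodes mean more of them fit per cache line.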

Faster variant

We can optimize for speed and forgo the packing, so we can actually use a (non-packed) trie as in the Rust solution, with indices instead of pointers. This gives something that's faster and has no pre-fixed limits on the number of distinct words, characters, etc.:

#include <iostream>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <string>
#include <utility>
#include <vector>
#include <algorithm>

typedef int32_t Pointer;  // [0..node.size()), an index into the array of Nodes
typedef int32_t Count;
typedef int8_t Char;  // We'll usually just have 1 to 26.
struct Node {
  Pointer link;  // From a parent node to its children's "header", or from a header back to parent.
  Count count;  // The number of times this word has been encountered. Undefined for header nodes.
};
std::vector<Node> node; // Our "arena" for Node allocation.

std::string word_for(Pointer p) {
  std::vector<char> drow;  // The word backwards
  while (p != 0) {
    Char c = p % 27;
    drow.push_back('a' - 1 + c);
    p = (p - c) ? node[p - c].link : 0;
  }
  return std::string(drow.rbegin(), drow.rend());
}

// `p` has no children. Create `p`s family of children, with only child `c`.
Pointer create_child(Pointer p, Char c) {
  Pointer h = node.size();
  node.resize(node.size() + 27);
  node[h] = {.link = p, .count = -1};
  node[p].link = h;
  return h + c;
}

// Advance `p` to its `c`th child. If necessary, add the child.
Pointer find_child(Pointer p, Char c) {
  assert(1 <= c && c <= 26);
  if (p == 0) return c;  // Special case for first char.
  if (node[p].link == 0) return create_child(p, c);  // Case 1: `p` currently has *no* children.
  return node[p].link + c;  // Case 2 (easiest case): Already have the child c.
}

int main(int, char** argv) {
  // Program startup
  std::ios::sync_with_stdio(false);

  // read in file contents
  FILE *fptr = fopen(argv[1], "rb");
  if (!fptr) { std::cerr << "Could not open file: " << argv[1] << std::endl; return 1; }
  fseek(fptr, 0, SEEK_END);
  long dataLength = ftell(fptr);
  rewind(fptr);
  char* data = (char*)malloc(dataLength);
  fread(data, 1, dataLength, fptr);
  fclose(fptr);

  node.reserve(dataLength / 600);  // Heuristic based on test data. OK to be wrong.
  node.push_back({0, 0});
  for (Char i = 1; i <= 26; ++i) node.push_back({0, 0});

  // Loop over file contents: the bulk of the time is spent here.
  Pointer p = 0;
  for (long i = 0; i < dataLength; ++i) {
    Char c = (data[i] | 32) - 'a' + 1;  // 1 to 26, for 'a' to 'z' or 'A' to 'Z'
    if (1 <= c && c <= 26) {
      p = find_child(p, c);
    } else {
      ++node[p].count;
      p = 0;
    }
  }
  ++node[p].count;
  node[0].count = 0;

  // Brute-force: Accumulate all words and their counts, then sort by frequency and print.
  std::vector<std::pair<int, std::string>> counts_words;
  for (Pointer i = 1; i < static_cast<Pointer>(node.size()); ++i) {
    int count = node[i].count;
    if (count == 0 || i % 27 == 0) continue;
    counts_words.push_back({count, word_for(i)});
  }
  // Sort by count descending, breaking ties alphabetically. Take the pairs by
  // const reference to avoid copying the strings on every comparison.
  auto cmp = [](const auto& x, const auto& y) {
    if (x.first != y.first) return x.first > y.first;
    return x.second < y.second;
  };
  std::sort(counts_words.begin(), counts_words.end(), cmp);
  const int max_words_to_print = std::min<int>(counts_words.size(), atoi(argv[2]));
  for (int i = 0; i < max_words_to_print; ++i) {
    auto [count, word] = counts_words[i];
    std::cout << word << " " << count << std::endl;
  }

  return 0;
}

This program, despite doing something a lot dumber for sorting than the other solutions here (see the aside after the timings), uses only 12.2 MB for its trie on giganovel, and manages to be faster. Timings of this program (last line), compared with the earlier timings:

target/release/frequent:   4.809 ±   0.263 [ 4.45.. 5.62]        [... 4.63 ...  4.75 ...  4.88...]
ptrie-walktrie:            4.547 ±   0.164 [ 4.35.. 4.99]        [... 4.42 ...   4.5 ...  4.68...]
itrie-nolimit:             3.907 ±   0.127 [ 3.69.. 4.23]        [... 3.81 ...   3.9 ...   4.0...]
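(About the "dumber" sorting: since only the top max_words_to_print entries are ever printed, the full std::sort above could in principle be replaced by std::partial_sort. A self-contained sketch, untimed here, with toy data standing in for counts_words:

#include <algorithm>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

int main() {
  // Same shape as counts_words above: (count, word) pairs.
  std::vector<std::pair<int, std::string>> v = {
      {3, "b"}, {7, "a"}, {7, "c"}, {1, "d"}, {5, "e"}};
  const int k = std::min<int>(v.size(), 3);
  auto cmp = [](const auto& x, const auto& y) {
    if (x.first != y.first) return x.first > y.first;
    return x.second < y.second;
  };
  // Orders only the first k elements; the rest are left in unspecified order.
  std::partial_sort(v.begin(), v.begin() + k, v.end(), cmp);
  for (int i = 0; i < k; ++i) std::cout << v[i].second << " " << v[i].first << "\n";
  return 0;
}

Since the bulk of the time is spent in the character loop, this is unlikely to change the timings much.)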

I'd be eager to see what this (or the hash-trie program) would look like if translated into Rust. :-)

Further details

  1. About the data structure used here: an explanation of "packing" tries is given tersely in Exercise 4 of Section 6.3 (Digital Searching, i.e. tries) in Volume 3 of TAOCP, and also in the thesis of Knuth's student Frank Liang about hyphenation in TeX: Word Hy-phen-a-tion by Com-put-er. (A toy illustration of packing follows this list.)

  2. The context of Bentley's columns, Knuth's program, and McIlroy's review (only a small part of which was about the Unix philosophy) is clearer in light of previous and later columns, and Knuth's previous experience including compilers, TAOCP, and TeX.

  3. There's an entire book Exercises in Programming Style, showing different approaches to this particular program, etc.
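To give a flavor of what "packing" means in point 1 (a toy sketch only; the real placement above also involves header back-links, the rerange window, and the tolerance limit): families of children are overlaid on a single array, and a new family with header position h may go wherever all of its occupied slots h + c are still empty, interleaving with the gaps left by existing families.

#include <cassert>

int main() {
  char slot[16] = {};  // 0 = empty
  // Family 1: header at h = 0, with children 'a' (offset 1) and 'e' (offset 5).
  slot[0] = 'H'; slot[1] = 'a'; slot[5] = 'e';
  // Family 2 has children 'a' and 'b' (offsets 1 and 2). It can be packed at
  // h = 2, because slots 2, 3 and 4 are all still empty -- it fits in the gaps.
  assert(slot[2] == 0 && slot[3] == 0 && slot[4] == 0);
  slot[2] = 'H'; slot[3] = 'a'; slot[4] = 'b';
  return 0;
}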

I have an unfinished blog post elaborating on the points above; might edit this answer when it's done. Meanwhile, posting this answer here anyway, on the occasion (Jan 10) of Knuth's birthday. :-)


Rust

On my computer, this runs giganovel 100000 about 42% faster (10.64 s vs. 18.24 s) than Moogie's C "prefix tree + bins" solution. It also has no predefined limits (unlike the C solution, which predefines limits on word length, unique words, repeated words, etc.).

src/main.rs

use memmap::MmapOptions;
use pdqselect::select_by_key;
use std::cmp::Reverse;
use std::default::Default;
use std::env::args;
use std::fs::File;
use std::io::{self, Write};
use typed_arena::Arena;

#[derive(Default)]
struct Trie<'a> {
    nodes: [Option<&'a mut Trie<'a>>; 26],
    count: u64,
}

fn main() -> io::Result<()> {
    // Parse arguments
    let mut args = args();
    args.next().unwrap();
    let filename = args.next().unwrap();
    let size = args.next().unwrap().parse().unwrap();

    // Open input
    let file = File::open(filename)?;
    let mmap = unsafe { MmapOptions::new().map(&file)? };

    // Build trie
    let arena = Arena::new();
    let mut num_words = 0;
    let mut root = Trie::default();
    {
        let mut node = &mut root;
        for byte in &mmap[..] {
            let letter = (byte | 32).wrapping_sub(b'a');
            if let Some(child) = node.nodes.get_mut(letter as usize) {
                node = child.get_or_insert_with(|| {
                    num_words += 1;
                    arena.alloc(Default::default())
                });
            } else {
                node.count += 1;
                node = &mut root;
            }
        }
        node.count += 1;
    }

    // Extract all counts
    let mut index = 0;
    let mut counts = Vec::with_capacity(num_words);
    let mut stack = vec![root.nodes.iter()];
    'a: while let Some(frame) = stack.last_mut() {
        while let Some(child) = frame.next() {
            if let Some(child) = child {
                if child.count != 0 {
                    counts.push((child.count, index));
                    index += 1;
                }
                stack.push(child.nodes.iter());
                continue 'a;
            }
        }
        stack.pop();
    }

    // Find frequent counts
    select_by_key(&mut counts, size, |&(count, _)| Reverse(count));
    // Or, in nightly Rust:
    //counts.partition_at_index_by_key(size, |&(count, _)| Reverse(count));

    // Extract frequent words
    let size = size.min(counts.len());
    counts[0..size].sort_by_key(|&(_, index)| index);
    let mut out = Vec::with_capacity(size);
    let mut it = counts[0..size].iter();
    if let Some(mut next) = it.next() {
        index = 0;
        stack.push(root.nodes.iter());
        let mut word = vec![b'a' - 1];
        'b: while let Some(frame) = stack.last_mut() {
            while let Some(child) = frame.next() {
                *word.last_mut().unwrap() += 1;
                if let Some(child) = child {
                    if child.count != 0 {
                        if index == next.1 {
                            out.push((word.to_vec(), next.0));
                            if let Some(next1) = it.next() {
                                next = next1;
                            } else {
                                break 'b;
                            }
                        }
                        index += 1;
                    }
                    stack.push(child.nodes.iter());
                    word.push(b'a' - 1);
                    continue 'b;
                }
            }
            stack.pop();
            word.pop();
        }
    }
    out.sort_by_key(|&(_, count)| Reverse(count));

    // Print results
    let stdout = io::stdout();
    let mut stdout = io::BufWriter::new(stdout.lock());
    for (word, count) in out {
        stdout.write_all(&word)?;
        writeln!(stdout, " {}", count)?;
    }

    Ok(())
}

Cargo.toml

[package]
name = "frequent"
version = "0.1.0"
authors = ["Anders Kaseorg <[email protected]>"]
edition = "2018"

[dependencies]
memmap = "0.7.0"
typed-arena = "1.4.1"
pdqselect = "0.1.0"

[profile.release]
lto = true
opt-level = 3

Usage

cargo build --release
time target/release/frequent ulysses64 10

[Rust] Optimized version of test9753's code

I changed a few details (e.g. pre-processing the data and splitting up the parent links from child links) while also making the code a bit more idiomatic.

On my machine this runs the giganovel about 20% faster.

use std::env;
use std::io::{Error, ErrorKind};

type Pointer = i32;
type Count = i32;
type Char = u8;

#[derive(Copy, Clone)]
struct Node {
    link: Pointer,
    count: Count,
}

fn word_for(parents: &[Pointer], mut p: Pointer) -> String {
    let mut drow = Vec::with_capacity(25); // sane max length
    while p != -1 {
        let c = p % 26;
        p = parents[(p / 26) as usize];
        drow.push(b'a' + (c as Char));
    }
    drow.reverse();
    String::from_utf8(drow).unwrap()
}

fn create_child(
    parents: &mut Vec<Pointer>,
    children: &mut Vec<Node>,
    p: Pointer,
    c: Char,
) -> Pointer {
    let h = children.len();
    children[p as usize].link = h as Pointer;
    children.resize(children.len() + 26, Node { link: -1, count: 0 });
    parents.push(p);
    (h as u32 + c as u32) as Pointer
}

fn find_child(
    parents: &mut Vec<Pointer>,
    children: &mut Vec<Node>,
    p: Pointer,
    c: Char,
) -> Pointer {
    let elem = children[p as usize];
    if elem.link == -1 {
        create_child(parents, children, p, c)
    } else {
        (elem.link as u32 + c as u32) as Pointer
    }
}

fn error(msg: &str) -> Error {
    Error::new(ErrorKind::Other, msg)
}

fn main() -> std::io::Result<()> {
    let args = env::args().collect::<Vec<String>>();
    if args.len() != 3 {
        return Err(error(&format!("Usage: {} file-path limit-num", args[0])));
    }
    let mut data: Vec<u8> = std::fs::read(&args[1])?;
    for d in &mut data {
        *d = (*d | 32).wrapping_sub(b'a'); // wrapping, so non-letters land at values >= 26
    }

    let mut children: Vec<Node> = Vec::with_capacity(data.len() / 600);
    let mut parents: Vec<Pointer> = Vec::with_capacity(data.len() / 600 / 26);
    parents.push(-1);
    for _ in 0..26 {
        children.push(Node { link: -1, count: 0 });
    }

    let mut data = data.iter();

    while let Some(&c) = data.next() {
        if c < 26 {
            let mut p = c as Pointer;

            while let Some(&c) = data.next() {
                if c < 26 {
                    p = find_child(&mut parents, &mut children, p, c);
                } else {
                    break;
                }
            }
            children[p as usize].count += 1;
        }
    }

    let mut counts_words: Vec<(i32, String)> = Vec::new();
    for (i, e) in children.iter().enumerate() {
        if e.count != 0 {
            counts_words.push((-e.count, word_for(&parents, i as Pointer)));
        }
    }

    counts_words.sort();

    let limit = args[2]
        .parse()
        .map_err(|_| error("ARGV[2] must be in range: [1, usize_max]"))?;

    for (count, word) in counts_words.iter().take(limit) {
        println!("{word} {count}", word = word, count = -count);
    }

    Ok(())
}

Tags:

Fastest Code