// Count occurrences of each word — an example program for a simple // trie implementation. // Applied to the 4,452,069-byte bible-pg10.txt from Project // Gutenberg, compiled with GCC 5.4.0-6ubuntu1~16.04.4 for i386 with // -O5, it generates the correct 579,687-byte output file in about // 330 ms, allocating 7,468,608 bytes according to valgrind (about // 12.9× the size of the output) and running 553,117,037 instructions // (about 124 per input byte). Compiled for amd64 it executes 16% // less instructions but takes 42% longer to run because it allocates // like twice as much memory and consequently has worse cache miss // rates. // To be somewhat more concrete, there were 34,057 distinct words // listed in the output file, stored in 109,714 allocated trie nodes // (14,929,024 bytes in the amd64 version). This works out to 3.22 // nodes per word. // (Hmm, maybe a Patricia or Patricia-ish version of this code, using // line endings, could beat GNU sort. It would use one node per word // instead of 3+. And it could mmap, avoiding copying under many // conditions.) // The Python version of this program takes 845 milliseconds, about // twice as long as this version. #include #include #include #include #include #define LENGTH(x) ((sizeof(x))/(sizeof(*x))) typedef struct trie { struct trie *kids[16]; int freq; } trie; static trie *new_trie() { trie *t = malloc(sizeof(*t)); if (!t) abort(); for (int i = 0; i < LENGTH(t->kids); i++) { t->kids[i] = NULL; } t->freq = 0; return t; } static trie *kid(trie *t, int i) { if (!t->kids[i]) t->kids[i] = new_trie(); return t->kids[i]; } static trie *blaze(trie *t, char *s, int n) { return n ? blaze(kid(kid(t, s[0] >> 4 & 0xf), s[0] & 0xf), s+1, n-1) : t; } static void increment(trie *t, char *word) { blaze(t, word, strlen(word))->freq++; } static void iterate_depth(trie *t, int depth, char *buf, void (*f)(void *userdata, char *word, int freq), void *userdata) { if (!t) return; if (t->freq) { buf[depth/2] = '\0'; f(userdata, buf, t->freq); } for (int i = 0; i < LENGTH(t->kids); i++) { buf[depth/2] = depth & 1 ? (buf[depth/2] & 0xf0) | i : i << 4; iterate_depth(t->kids[i], depth+1, buf, f, userdata); } } static void iterate(trie *t, void (*f)(void *userdata, char *word, int freq), void *userdata) { char buf[256] = {0}; iterate_depth(t, 0, buf, f, userdata); } static void print_freq(void *userdata, char *word, int freq) { printf("%8d %s\n", freq, word); } static void delete(trie *t) { if (!t) return; for (int i = 0; i < LENGTH(t->kids); i++) { delete(t->kids[i]); t->kids[i] = NULL; } free(t); } int main() { trie *t = new_trie(); char word[256]; int wordlen = 0, c; while ((c = getchar()) != EOF) { if (isspace(c)) { if (wordlen) { word[wordlen] = '\0'; increment(t, word); wordlen = 0; } } else { word[wordlen++] = c; assert(wordlen < sizeof(word)); } } iterate(t, print_freq, NULL); delete(t); return 0; }