// written in ed because cellphone keyboards suck at vi // linear probing version to see if // better locality improves things // On my 5,805,447-word RFC corpus, this code // processes about one word per 290ns on my // cellphone. About 140ns of that is just splitting // the input into words. The separate-chaining // version in justhash.c is more like 330ns, so // this is modestly faster, but requires much // more code to rehash the table when full. // The hash header `table` points to a separate // array of `node`s so we can rehash when it gets // too full and allocate a larger array. The // state where the array pointer `a` is null is // used to represent an empty table, which will // get a 16-entry array on first insert, so the // zero-filled state provided by globals, .bss, // calloc, or ={0} is valid. Rehashing grows the // array exponentially to guarantee amortized // constant time. // The hash function is djb2 tweaked by a final // multiplication because linear probing doesn't // deal well with clustered hashes. I also use // a different seed because I forget the standard // one. // The xstrdup and xmalloc functions aped from // libiberty abort on allocation failure, which is // fine for this purposr. #include #include #include #include static inline void *xmalloc(size_t n) { void *p = malloc(n); if (!p) abort(); return p; } static inline char *xstrdup(char *s) { char *p = strdup(s); if (!p) abort(); return p; } void xfree(void *p) { free(p); } typedef struct node { char *s; uint32_t hash; int count; } node; typedef struct table { node *a; size_t size, items; } table; table words; uint32_t hash(char *s) { uint32_t h = 1234; while (*s) h = h * 33 + *s++; return h * 0x1011; } void rehash(table *t) { size_t n = 0, m = 0x10; if (t->a) n = t->size, m = n * 4; size_t s = m * sizeof(node); node *a = xmalloc(s); memset(a, 0, s); for (node *p = t->a; p != t->a + n; p++) { if (!p->s) continue; size_t j = p->hash & m-1; while (a[j].s) j = j+1 & m-1; a[j] = *p; } xfree(t->a); t->a = a; t->size = m; } node *upsert(table *t, char *s) { if (t->items >= t->size*3/4) rehash(t); uint32_t h = hash(s); int i = h & t->size - 1; for (; t->a[i].s; i = i + 1 & t->size - 1) { if (0 == strcmp(s, t->a[i].s)) return &t->a[i]; } t->items++; t->a[i] = (node) { xstrdup(s), h }; return &t->a[i]; } void print_words(table *t, int *total_ptr) { int total = 0; for (size_t i = 0; i != t->size; i++) { node *n = &t->a[i]; if (!n->s) continue; printf("%8d %s\n", n->count, n->s); total += n->count; } *total_ptr = total; } int max_chain(table *t) { int max = 0, len = 0; for (size_t i = 0; i != t->size; i++) { len = t->a[i].s ? len + 1 : 0; if (len > max) max = len; // XXX handle wraparound } return max; } #ifdef NO_HASHING #define upsert(...) (&dummy) #endif node dummy; int main(int argc, char **argv) { char buf[128]; for (;;) { if (!fgets(buf, sizeof buf, stdin)) break; char *p = buf; for (char *q = buf; *q; q++) { char c = *q; if ('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') { if (!*p) p = q; continue; } *q = 0; if (strlen(p)) upsert(&words, p)->count++; p = q; } if (strlen(p)) upsert(&words, p)->count++; } int total; print_words(&words, &total); printf("dummy count %d", dummy.count); fprintf(stderr, "max chain %d, total words %d\n", max_chain(&words), total); return 0; }