#!/usr/bin/python3
"""Generate an instance of Knuth’s problem 5–24.

Usage: ./gen24.py [outputsize [dictfile [smoothing]]]

The problem is to generate a sorted list from an unsorted list of
predecessor-successor pairs, but it is framed as applying to “three
million men with distinct names”.  So I thought it would be fun to
generate three million distinct names respecting (barely) English
phonotactics.  This program accomplishes that in about five minutes
and about half a gigabyte of RAM, generating an output file of 50–64
megabytes.

With 46 syllable onsets, 10 syllable nuclei, and 24 syllable codas,
it can generate any of 11,040 possible syllables, giving 11,040
possible one-syllable names such as “Blug”, 121,881,600 two-syllable
names such as “Sliphdrylg”, 1,345,572,864,000 three-syllable names
such as “Nygslinsuz”, and 14,855,124,418,560,000 four-syllable names
such as “Sprisshechsyspshest”.  About 13% of the syllables generated
are existing English words, according to /usr/share/dict/words, and
about 3% are existing English names or chemical symbols.  About 0.01%
of two-syllable names are existing English words.

Optionally, you can give it a dictionary or sample text file and a
smoothing parameter to make its generated names look more like
English or some other language.  With /usr/share/dict/american-english
and smoothing .001, it generates names like “Nessnal” and “Atvery”;
with smoothing 1 instead of .001, “Bristenmut” and “Ytcur”; and with
smoothing of 100 or greater, its results are indistinguishable from a
version without training data.  With /usr/share/dict/spanish and
smoothing .001, it generates names like “Antrip” and “Ocri”, which
are at least pronounceable in Spanish, if not normal names.

"""
import bisect
import random
import re
import sys

import trieparse        # local module: builds a trie over the syllables


def gen24(names):
    "Pair each name with its successor in a random order, then shuffle."
    xs = list(names)
    random.shuffle(xs)          # Avoid hash value order
    ys = [xs[i] + ' ' + xs[i+1] for i in range(len(xs)-1)]
    random.shuffle(ys)
    return ys


class DiscreteDistribution:
    "Generate a weighted random choice from a discrete distribution."

    def __init__(self, weights, items=None):
        "weights need not be normalized."
        if items is None:
            items = range(len(weights))
        self.items = items
        self.cdf = []           # running cumulative sums of the weights
        total = 0
        for v in weights:
            total += v
            self.cdf.append(total)
        self.max = total

    def sample(self):
        return self.items[bisect.bisect_left(self.cdf,
                                             random.random() * self.max)]
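# Sampling is by inverse transform: a uniform draw in [0, self.max) is
# located among the cumulative sums by binary search, so each item is
# chosen with probability proportional to its weight.  A hypothetical
# example, not part of the program itself:
#
#     coin = DiscreteDistribution(weights=[3, 1], items=['heads', 'tails'])
#     coin.sample()       # 'heads' about 75% of the time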
lengths = DiscreteDistribution(items=[1, 2, 3, 4],
                               weights=[30, 100, 30, 1])

onsets = (['', 'qu', 'sh', 'ch', 'spl', 'th', 'wh']
          + [C + 'r' for C in 'bcdfgkpt']
          + ['s' + C + R for R in ('', 'r') for C in 'cpt']
          + [C + 'l' for C in 'bcfgps']
          + list('bcdfghjklmnprstvwzy'))
vowels = ['a', 'e', 'i', 'o', 'u', 'y', 'oo', 'ea', 'ai', 'ou']
codas = (['', 'ck', 'ss', 'nt', 'st', 'sp', 'lg', 'ph', 'ch', 'sh', 'ng']
         + list('bdfglmnprstxz'))
syllables = [O + V + C for O in onsets for V in vowels for C in codas]
flat = DiscreteDistribution(weights=[1 for syllable in syllables],
                            items=syllables)


def names(n, dist=flat):
    rv = set()
    while len(rv) < n:
        rv.add(name(dist=dist))
    return rv


def name(dist=flat):
    return word(lengths.sample(), dist=dist).capitalize()


def word(n, dist=flat):
    return ''.join(dist.sample() for i in range(n))


def train(wordlist, syllables=syllables, smoothing=1):
    freqs = {syl: smoothing for syl in syllables}  # Laplace smoothing
    parser = re.compile(trieparse.to_trie(syllables).to_re())
    for word in wordlist:
        for syllable in parser.findall(word.lower()):
            freqs[syllable] += 1
    return [freqs[syllable] for syllable in syllables]


if __name__ == '__main__':
    if len(sys.argv) > 1:
        n = int(sys.argv[1])
    else:
        n = 3 * 10**6

    if len(sys.argv) > 2:
        with open(sys.argv[2]) as words:
            wordlist = [w.strip() for w in words]
        smoothing = float(sys.argv[3]) if len(sys.argv) > 3 else 1
        dist = DiscreteDistribution(weights=train(wordlist, syllables, smoothing),
                                    items=syllables)
    else:
        dist = flat

    for pair in gen24(names(n, dist)):
        print(pair)
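# Example invocations, per the usage line in the docstring above (the
# output size 1000 is an arbitrary illustration):
#
#     ./gen24.py                  # three million names as shuffled pairs
#     ./gen24.py 1000             # a smaller run
#     ./gen24.py 1000 /usr/share/dict/american-english .001
#
# The last form trains the syllable distribution on a dictionary with
# Laplace smoothing of .001, producing more English-looking names.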