#!/usr/bin/python3
"""Generate an instance of Knuth’s problem 5–24.

Usage: ./gen24.py [outputsize [dictfile [smoothing]]]

The problem is to generate a sorted list from an unsorted list of
predecessor-successor pairs, but it is framed as applying to “three
million men with distinct names”.  So I thought it would be fun to
generate three million distinct names respecting (barely) English
phonotactics.  This program accomplishes that in about five minutes
and about half a gigabyte of RAM, generating an output file of 50–64
megabytes.

With 46 syllable onsets, 10 syllable nuclei, and 24 syllable codas,
it can generate any of 11,040 possible syllables, giving 11,040
possible one-syllable names such as “Blug”, 121,881,600 two-syllable
names such as “Sliphdrylg”, 1,345,572,864,000 three-syllable names
such as “Nygslinsuz”, and 14,855,124,418,560,000 four-syllable names
such as “Sprisshechsyspshest”.  About 13% of the syllables generated
are existing English words, according to /usr/share/dict/words, and
about 3% are existing English names or chemical symbols.  About 0.01%
of two-syllable names are existing English words.

Optionally, you can give it a dictionary or sample text file and a
smoothing parameter to make its generated names look more like
English or some other language.  With /usr/share/dict/american-english
and smoothing .001, it generates names like “Nessnal” and “Atvery”;
with smoothing 1 instead of .001, “Bristenmut” and “Ytcur”; and with
smoothing of 100 or greater, its results are indistinguishable from a
version without training data.  With /usr/share/dict/spanish and
smoothing .001, it generates names like “Antrip” and “Ocri”, which
are at least pronounceable in Spanish, if not normal names.

"""
import bisect
import random
import re
import sys

import trieparse        # local module: builds a trie over the syllables


def gen24(names):
    "Pair each name with its successor in a random order, then shuffle."
    xs = list(names)
    random.shuffle(xs)          # Avoid hash value order
    ys = [xs[i] + ' ' + xs[i+1] for i in range(len(xs)-1)]
    random.shuffle(ys)
    return ys


class DiscreteDistribution:
    "Generate a weighted random choice from a discrete distribution."

    def __init__(self, weights, items=None):
        "weights need not be normalized."
        if items is None:
            items = range(len(weights))
        self.items = items
        self.cdf = []           # running cumulative sums of the weights
        total = 0
        for v in weights:
            total += v
            self.cdf.append(total)
        self.max = total

    def sample(self):
        return self.items[bisect.bisect_left(self.cdf,
                                             random.random() * self.max)]
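# Sampling is by inverse transform: a uniform draw in [0, self.max) is
# located among the cumulative sums by binary search, so each item is
# chosen with probability proportional to its weight.  A hypothetical
# example, not part of the program itself:
#
#     coin = DiscreteDistribution(weights=[3, 1], items=['heads', 'tails'])
#     coin.sample()       # 'heads' about 75% of the time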
lengths = DiscreteDistribution(items=[1, 2, 3, 4],
                               weights=[30, 100, 30, 1])

onsets = (['', 'qu', 'sh', 'ch', 'spl', 'th', 'wh']
          + [C + 'r' for C in 'bcdfgkpt']
          + ['s' + C + R for R in ('', 'r') for C in 'cpt']
          + [C + 'l' for C in 'bcfgps']
          + list('bcdfghjklmnprstvwzy'))
vowels = ['a', 'e', 'i', 'o', 'u', 'y', 'oo', 'ea', 'ai', 'ou']
codas = (['', 'ck', 'ss', 'nt', 'st', 'sp', 'lg', 'ph', 'ch', 'sh', 'ng']
         + list('bdfglmnprstxz'))
syllables = [O + V + C for O in onsets for V in vowels for C in codas]
flat = DiscreteDistribution(weights=[1 for syllable in syllables],
                            items=syllables)


def names(n, dist=flat):
    rv = set()
    while len(rv) < n:
        rv.add(name(dist=dist))
    return rv


def name(dist=flat):
    return word(lengths.sample(), dist=dist).capitalize()


def word(n, dist=flat):
    return ''.join(dist.sample() for i in range(n))


def train(wordlist, syllables=syllables, smoothing=1):
    freqs = {syl: smoothing for syl in syllables}  # Laplace smoothing
    parser = re.compile(trieparse.to_trie(syllables).to_re())
    for word in wordlist:
        for syllable in parser.findall(word.lower()):
            freqs[syllable] += 1
    return [freqs[syllable] for syllable in syllables]


if __name__ == '__main__':
    if len(sys.argv) > 1:
        n = int(sys.argv[1])
    else:
        n = 3 * 10**6

    if len(sys.argv) > 2:
        with open(sys.argv[2]) as words:
            wordlist = [w.strip() for w in words]
        smoothing = float(sys.argv[3]) if len(sys.argv) > 3 else 1
        dist = DiscreteDistribution(weights=train(wordlist, syllables, smoothing),
                                    items=syllables)
    else:
        dist = flat

    for pair in gen24(names(n, dist)):
        print(pair)
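# Example invocations, per the usage line in the docstring above (the
# output size 1000 is an arbitrary illustration):
#
#     ./gen24.py                  # three million names as shuffled pairs
#     ./gen24.py 1000             # a smaller run
#     ./gen24.py 1000 /usr/share/dict/american-english .001
#
# The last form trains the syllable distribution on a dictionary with
# Laplace smoothing of .001, producing more English-looking names.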