#!/usr/bin/python3
"""Find Spanish letter-number rebuses.

For example, “K0” is “Ka Cero”, which is (in Latin American Spanish)
“casero”, “homemade”.  This program finds other such words, such as
KBCO, KCIK, ACIT, ATO, ZCO, QI2A, GOID, >IA, <IA, C2O, 5P, CK, IDA,
+AG, and KGT.  It’s a little slow, taking 6.7 seconds on my laptop.

"""
import re

import trieparse

# Spanish names of letters, digits and symbols, keyed by the symbol itself.
# Each name is spelled in the same simplified phoneme alphabet that
# pronounce() emits (for example "ch" is "ty", "ll" is "y", "v" is "b" and
# soft "c"/"z" is "s"), so the names can be matched directly against the
# pronunciation of a dictionary word.
#
# NOTE(review): "E" and "U" have no entries although the other vowels do --
# confirm whether that omission is intentional.
pronunciations = {
    # Letters of the alphabet.
    "A": "a",
    "B": "be",
    "C": "se",
    "D": "de",
    "F": "efe",
    "G": "je",
    "H": "atye",
    "I": "i",
    "J": "jota",
    "K": "ka",
    "L": "ele",
    "M": "eme",
    "N": "ene",
    "Ñ": "enie",
    "O": "o",
    "P": "pe",
    "Q": "ku",
    "R": "erre",
    "S": "ese",
    "T": "te",
    "V": "ube",
    "W": "doblebe",
    "X": "ekis",
    "Y": "igriega",
    "Z": "seta",
    # Digits.
    "0": "sero",
    "1": "uno",
    "2": "dos",
    "3": "tres",
    "4": "kuatro",
    "5": "sinko",
    "6": "seis",
    "7": "siete",
    "8": "otyo",
    "9": "nuebe",
    # Punctuation, operators and other keyboard symbols.
    ",": "koma",
    ".": "punto",
    ";": "puntoikoma",
    ":": "dospuntos",
    "+": "mas",
    "-": "menos",
    "<": "menor",
    ">": "mayor",
    "=": "igual",
    "@": "arroba",
    "$": "pesos",
    "%": "porsiento",
    '"': "komiyas",
    "×": "por",
    "/": "barra",
    # Pictographs.
    "♣": "trebol",
    "☾": "luna",
    "☼": "sol",
    # Greek letters.
    "Δ": "delta",
    "π": "pi",
    "θ": "tita",
    "α": "alfa",
    "β": "beta",
    "Γ": "gama",
    "Ω": "omega",
    "Φ": "fi",
    "ι": "iota",
    "ψ": "psi",
    # Miscellaneous symbols.
    # NOTE(review): pronounce("rey") yields "rei" (a final "y" becomes "i"),
    # so the "rey" spelling below only matches where a "y" phoneme precedes
    # a vowel -- confirm that this is what is wanted.
    "♚": "rey",
    "♛": "dama",
    "°": "grado",
    "♌": "leo",
    "☛": "mano",
}

# Regex source derived from a trie of every symbol pronunciation
# (presumably an alternation matching any single pronunciation -- see
# trieparse for the exact form).
pro_re_base = trieparse.to_trie(pronunciations.values()).to_re()

# One or more pronunciations back to back; after a match, group 1 holds the
# text of the last repetition only.
pro_re_repeated = re.compile('(%s)+' % pro_re_base)

# Reverse lookup: phonemic spelling -> the symbol that is read that way.
inverted_pro = {sound: symbol for symbol, sound in pronunciations.items()}

# Context-free respellings: each of these letters always turns into the same
# (possibly empty, possibly two-character) phoneme string, whatever its
# neighbours are.
letter_mappings = {
    # Consonants that merge with, or expand into, other phonemes.
    "v": "b",
    "w": "u",
    "z": "s",
    "h": "",    # silent
    "ñ": "ni",
    "x": "ks",
    # Accented vowels lose their diacritic, since stress is not modelled.
    "á": "a",
    "í": "i",
    "é": "e",
    "ó": "o",
    "ú": "u",
    "ý": "y",
    "ü": "u",
}

def base_pronounce(word):
    """Yield the phonemes of a Spanish word, one character at a time.

    The word is lower-cased first.  Stress is not represented, and
    consecutive identical phonemes are NOT collapsed here, so the output may
    contain doubled phonemes.

    Raises Exception (carrying the offending character) when the word
    contains a character that none of the rules below covers.
    """
    text = word.lower()
    length = len(text)
    context_vowels = set('aeiouáéíóúyýü')  # what counts as a vowel after "y"
    softening_vowels = set('ieyíéý')       # vowels that soften "c" and "g"
    skip_next = False

    for pos, letter in enumerate(text):
        # The previous letter already consumed this one (ch, ll, gu, qu).
        if skip_next:
            skip_next = False
            continue

        following = text[pos + 1] if pos + 1 < length else None
        after_that = text[pos + 2] if pos + 2 < length else None

        # Word-initial "ps" (psicología and friends): the "p" is dropped.
        if pos == 0 and letter == 'p' and following == 's':
            continue

        if letter in 'abdefijkmnoprstu':
            # These letters stand for themselves.
            yield letter
        elif letter in letter_mappings:
            # Fixed respelling that does not depend on context.
            yield from letter_mappings[letter]
        elif letter == 'c':
            # "ch" is the affricate "ty"; soft "c" is "s" (Latin American
            # pronunciation); any other "c" is "k".
            if following == 'h':
                yield from 'ty'
                skip_next = True
            elif following in softening_vowels:
                yield 's'
            else:
                yield 'k'
        elif letter == 'g':
            # Soft "g" sounds like "j"; in "gue"/"gui" the "u" is silent.
            yield 'j' if following in softening_vowels else 'g'
            skip_next = following == 'u' and after_that in softening_vowels
        elif letter == 'q':
            # Always "k"; in "que"/"qui" the "u" is silent.
            yield 'k'
            skip_next = following == 'u' and after_that in softening_vowels
        elif letter == 'l':
            # "ll" is treated as "y" (the ll/y merger).
            if following == 'l':
                yield 'y'
                skip_next = True
            else:
                yield 'l'
        elif letter == 'y':
            # Consonant before a vowel, otherwise the vowel "i".
            yield 'y' if following in context_vowels else 'i'
        else:
            raise Exception(letter)

def pronounce(word):
    """Yield the phonemes of *word* with immediate repetitions collapsed.

    Wraps base_pronounce() and drops any phoneme equal to the one just
    before it, except for "r", which is allowed to repeat.
    """
    previous = None
    for sound in base_pronounce(word):
        repeated = sound == previous
        previous = sound
        # Spanish arguably has no phonetic gemination: “innegable” sounds
        # like “inegable”, “cooperación” like “coperación”, and even
        # “azahar” like “azár”.  The doubled "r" is kept as written.
        if repeated and sound != 'r':
            continue
        yield sound

# A few sample words that are handy for trying the functions out by hand.
test_words = ['demas', 'casero', 'pedos']

def letterify(pronunciation):
    """Spell a pronunciation as a sequence of symbols, if possible.

    Returns the string of symbols whose names, read one after another, make
    up *pronunciation*.  Returns None when pro_re_repeated does not match
    the pronunciation all the way to its end.
    """
    whole = pro_re_repeated.match(pronunciation)
    if whole is None or whole.end() != len(pronunciation):
        return None

    # A match object only remembers the LAST text captured by a repeated
    # group, so the names are peeled off from the right: match, take the
    # final name, chop it off the end, and repeat on what is left.  In
    # practice this finds the same words as a plain findall would, though
    # in theory it could find more.
    names_from_the_right = []
    remaining = pronunciation
    while remaining:
        last_name = pro_re_repeated.match(remaining).group(1)
        names_from_the_right.append(last_name)
        remaining = remaining[:-len(last_name)]

    return ''.join(inverted_pro[name]
                   for name in reversed(names_from_the_right))

def inflect(word):
    """Yield *word* itself, then one candidate plural form of it.

    The plural follows the basic Spanish rules:

    * final "z" becomes "ces" (luz -> luces);
    * any other final consonant, including "s" and "y", takes "es"
      (papel -> papeles, mes -> meses, rey -> reyes);
    * a final vowel takes "s" (casa -> casas, sofá -> sofás).

    Written accents are not adjusted, and words whose plural is really
    invariant (lunes, crisis) still get "es".  For verbs the plural is
    meaningless, but that creates few problems.  An empty word, or one
    ending in a non-letter, simply gets "s" appended.
    """
    yield word

    last = word[-1:]  # '' for an empty word, so no IndexError
    if last == 'z':
        yield word[:-1] + "ces"
    elif last.isalpha() and last.lower() not in 'aeiouáéíóú':
        # Consonant-final words pluralise with "-es", not a bare "-s".
        yield word + "es"
    else:
        yield word + "s"

if __name__ == '__main__':
    # Scan the system Spanish word list and print every word (or candidate
    # plural) whose pronunciation can be spelled with two or more symbols.
    # The file is opened in a `with` block so it is closed even if a word
    # raises part-way through.
    with open('/usr/share/dict/spanish') as dictionary:
        for line in dictionary:
            for form in inflect(line.strip()):
                letters = letterify(''.join(pronounce(form)))
                # letterify() returns None when there is no spelling, and
                # single-symbol results are too trivial to report.
                if letters and len(letters) > 1:
                    print(form, letters)