#!/usr/bin/python3
"""Find Spanish letter-number rebuses.

For example, “K0” is “Ka Cero”, which is (in Latin American Spanish)
“casero”, “homemade”.  This program finds other such words, such as
KBCO, KCIK, ACIT, ATO, ZCO, QI2A, GOID, >IA, and so on.
"""

# NOTE(review): this copy of the file is truncated between the docstring
# example ">IA," above and the "mayor" entry below.  The rest of the
# docstring, the import lines, the opening of the `pronunciations` table and
# all of its earlier entries are missing and must be restored from the
# upstream source.  Judging by the docstring examples the lost entries are
# presumably the letter and digit names (e.g. K -> "ka", 0 -> "sero") --
# confirm.  The two imports below are inferred from the names the visible
# code uses.
import re

import trieparse  # NOTE(review): presumably a project-local module -- confirm


# Maps each rebus symbol to the phonemic spelling of its spoken name, using
# the same phoneme alphabet that pronounce() emits.
pronunciations = {
    # NOTE(review): earlier entries lost (see note above).  The ">" key is
    # inferred from the ">IA" example in the docstring -- confirm.
    ">": "mayor",
    "=": "igual",
    "@": "arroba",
    "$": "pesos",
    "%": "porsiento",
    '"': "komiyas",
    "×": "por",
    "/": "barra",
    "♣": "trebol",
    "☾": "luna",
    "☼": "sol",
    "Δ": "delta",
    "π": "pi",
    "θ": "teta",
}

# Regex source matching any one symbol name (it is interpolated into the
# pattern below), and a compiled pattern matching a run of such names.
pro_re_base = trieparse.to_trie(pronunciations.values()).to_re()
pro_re_repeated = re.compile('(%s)+' % pro_re_base)

# Reverse lookup: phonemic name -> symbol.
inverted_pro = {name: symbol for symbol, name in pronunciations.items()}

# Letters whose phonemic rendering is a fixed string other than themselves.
letter_mappings = {
    'v': 'b',
    'w': 'u',
    'z': 's',
    'h': '',
    'ñ': 'ni',
    'x': 'ks',
    'á': 'a',
    'í': 'i',
    'é': 'e',
    'ó': 'o',
    'ú': 'u',
    'ý': 'y',
    'ü': 'u',
}

# Letters that are their own phoneme.
_IDENTITY_LETTERS = frozenset('abdefijkmnoprstu')
# Letters counted as vowels when deciding how a preceding "y" sounds.
_VOWELS = frozenset('aeiouáéíóúyýü')
# Letters that soften a preceding c/g and silence the "u" of "gu"/"qu".
_HIGH_VOWELS = frozenset('ieyíéý')
# Final letters after which the plural is formed with a bare "s".
_PLURAL_S_ENDINGS = frozenset('aeiouáéíóú')


def base_pronounce(word):
    """Generate the phonemes of `word`, one character at a time.

    The output may contain doubled phonemes (pronounce() removes them) and
    carries no stress information.  The word is lower-cased first.

    Raises ValueError for a character that no rule covers.
    """
    word = word.lower()
    last = len(word) - 1
    skip = False  # set when the current rule has consumed the next letter too
    for i, c in enumerate(word):
        if skip:
            skip = False
            continue
        following = word[i + 1] if i < last else None
        following2 = word[i + 2] if i < last - 1 else None

        # Special case for psicología etc.: the initial p is silent.
        if i == 0 and c == 'p' and following == 's':
            continue

        if c in _IDENTITY_LETTERS:
            yield c
            continue

        if c in letter_mappings:
            # The mapping may be empty (h) or two phonemes long (ñ, x).
            yield from letter_mappings[c]
            continue

        # The remaining letters have context-dependent pronunciations.
        # Spanish orthography is fairly phonetic, so a few rules suffice.
        if c == 'c':
            if following == 'h':
                # "ch" is rendered as the phoneme pair t + y.
                yield 't'
                yield 'y'
                skip = True
            elif following in _HIGH_VOWELS:
                yield 's'  # Latin American Spanish lacks ceceo
            else:
                yield 'k'
        elif c == 'g':
            yield 'j' if following in _HIGH_VOWELS else 'g'
            # The u in "gue"/"gui" is silent.
            skip = following == 'u' and following2 in _HIGH_VOWELS
        elif c == 'q':
            yield 'k'
            # The u in "que"/"qui" is silent.
            skip = following == 'u' and following2 in _HIGH_VOWELS
        elif c == 'l':
            if following == 'l':
                yield 'y'  # ll/y merger is nearly universal nowadays
                skip = True
            else:
                yield 'l'
        elif c == 'y':
            # Consonantal before a vowel, vocalic otherwise.
            yield 'y' if following in _VOWELS else 'i'
        else:
            raise ValueError(
                'no pronunciation rule for %r in %r' % (c, word))


def pronounce(word):
    """Generate the phonemes of `word` with adjacent duplicates collapsed.

    Spanish arguably doesn’t have phonetic gemination, so “innegable” is
    pronounced as “inegable”, “cooperación” as “coperación”, and even
    “azahar” as “azár”.  The exception is “r”, which is kept when doubled.
    """
    last_phoneme = None
    for phoneme in base_pronounce(word):
        if phoneme == 'r' or phoneme != last_phoneme:
            yield phoneme
        last_phoneme = phoneme


test_words = 'demas casero pedos'.split()


def letterify(pronunciation):
    """Spell `pronunciation` as a sequence of rebus symbols.

    Returns the string of symbols whose names, concatenated, equal
    `pronunciation`, or None when no such decomposition exists (including
    for the empty string).
    """
    # fullmatch rather than match: an unanchored match may greedily commit
    # to a tokenization that stops short of the end, even though another
    # tokenization covers the whole string.
    if pro_re_repeated.fullmatch(pronunciation) is None:
        return None

    # A match object only exposes the *last* text captured by a repeated
    # group, so peel tokens off the end one at a time.  Every remaining
    # prefix is itself fully decomposable, so fullmatch always succeeds here
    # and the captured token is guaranteed to be a suffix of the text.
    tokens = []
    while pronunciation:
        token = pro_re_repeated.fullmatch(pronunciation).group(1)
        tokens.append(token)
        pronunciation = pronunciation[:-len(token)]
    tokens.reverse()
    return ''.join(inverted_pro[token] for token in tokens)


def inflect(word):
    """Generate `word` followed by a guess at its plural.

    For verbs the plural is invalid, but this creates few problems.
    """
    yield word
    if word.endswith('z'):
        yield word[:-1] + "ces"
    elif word.endswith('s') or (
            word and word[-1].lower() not in _PLURAL_S_ENDINGS):
        # Consonant-final words take "-es" ("papel" -> "papeles").
        yield word + "es"
    else:
        yield word + "s"


def main():
    """Print every dictionary word (or plural) that spells as a rebus."""
    with open('/usr/share/dict/spanish') as wordlist:
        for line in wordlist:
            entry = line.strip()
            if not entry:
                continue
            for form in inflect(entry):
                letters = letterify(''.join(pronounce(form)))
                # A single symbol is not much of a rebus.
                if letters and len(letters) > 1:
                    print(form, letters)


if __name__ == '__main__':
    main()