#!/usr/bin/python3 """Leetcodish problem using a dynamic programming algorithm. Not sure if ChatGPT invented this problem or directly copied it from Leetcode or a similar site. The problem is whether you can build a given string out of words from a “dictionary” list by concatenating them (without spaces between, which means that there’s potentially more than one way to produce a given string). The dictionary is guaranteed to contain at most 1000 words, which are alphanumeric and nonempty, and the string is at most 300 characters. I’m also assuming the dictionary words can’t be more than 300 characters. """ import re def test_all(f, cruel=True): test(f, True, "leetcode", ["leet", "code"]) test(f, False, "xleetcode", ["leet", "code"]) test(f, False, "lecodeet", ["code", "leet"]) test(f, False, "leetcodex", ["leet", "code"]) test(f, False, "catsandog", ["cats", "dog", "sand", "and", "cat"]) test(f, True, "catsanddog", ["cats", "dog", "sand", "and", "cat"]) test(f, True, "catsanddog", ["cats", "dog", "sand", "cat"]) test(f, True, "applepenapple", ["apple", "pen"]) test(f, False, "xapplepenapple", ["apple", "pen"]) # The above verifies that the code gives correct examples in some # cases, but what if it’s inefficient? Let’s test its efficiency, # at least a little. # 988 possible words. Walk softly and carry a big dict. big_dict = ['word' + c + c2 + c3 for c in 'abcdefghijklmnopqrstuvwxyz' for c2 in 'abcdefghijklmnopqrs' for c3 in 'ab' ] big_string = 'kettles' * 50 + 'x' # 295 characters test(f, False, big_string, big_dict) test(f, False, big_string, big_dict + ['kettles']) test(f, True, big_string, big_dict + ['kettles', 'x']) # Here’s a much worse case for some algorithms. cruel_dict = [c * n for c in 'abc' for n in range(1, 301)] test(f, True, 'a' * 300, cruel_dict) test(f, False, 'a' * 10 + 'x', cruel_dict) if cruel: test(f, False, 'a' * 299 + 'x', cruel_dict) def test(f, expected, s, dictionary): #print(f, s[:32] + ('' if len(s) <= 32 else '...')) assert f(s, dictionary) == expected, (s, dictionary) def wordBreak(s, dictionary): # This is the first version I improvised off the cuff. reached = [False] * (len(s) + 1) # Can we reach this position? for word in dictionary: # Say, from the beginning of the string? if s.startswith(word): reached[len(word)] = True # Now, we want to see if we can reach positions from previously # reached positions. for i in range(len(reached)): for word in dictionary: start = i - len(word) if start >= 0 and reached[start] and s[start:i] == word: reached[i] = True #print(list(zip(s, reached))) return reached[-1] test_all(wordBreak) def word_break_simpler(s, dictionary): # This is a better version, reached by modifying the version # above. As a bonus, it correctly returns True for the empty # string. We start by noting that we can reach the beginning of # the string. reached = [True] + [False] * len(s) for i in range(len(reached)): if reached[i]: # Obvious optimizations here would include, for example, # only testing words that start with the right letter. for word in dictionary: if s[i:i + len(word)] == word: reached[i + len(word)] = True return reached[-1] test_all(word_break_simpler) def word_break_regexp(s, dictionary): """Solution using regular expressions. With Python’s regular expression engine this takes exponential time for some inputs; see the ``cruel`` test above for an example. Even with the input ``'a' * 20 + 'x'`` it takes 12 seconds, 23 seconds with ``'a' * 21 + 'x'``, 43 seconds with ``'a' * 22 + 'x'`, 86 seconds with 23, etc. """ words = '|'.join(re.escape(word) for word in dictionary) return True if re.match(f'(?:{words})*\\Z', s) else False test_all(word_break_regexp, cruel=False) def word_break_nfa(s, dictionary): """Solution by manually implementing regexps with a Thompson NFA. On the first ``catsanddog`` example above, the one with the dictionary ``["cats", "dog", "sand", "and", "cat"]``, it goes through the following state sequence: c {0} a {1} t {2} s {0, 3} -- possibly a new word after “cat” or the “s” in “cats” a {8, 0, 4} -- 8 is after “s” of “sand”, 4 is past the end of “cats” n {9, 12} -- now we could be in either “sand” or “and” d {10, 13} d {0, 11, 14} -- now, a new word or off the end of “sand” or “and” o {5} g {6} In a sense this is the “only testing words that start with the right letter” approach taken to its logical extreme, to the point that the dynamic-programming part isn’t even necessary any more. is a good, short introduction to the automata theory behind this approach (and related approaches that are more general!). Compilers textbooks usually also cover it, often at rather excessive length. I think I first read about it in 01995 but haven't implemented it even once in the last 28 years. This is almost “solution by implementing a trie” or “by implementing a DAWG”. The NFA is only barely nondeterministic: it includes an ε-transition (unlabeled arrow) at the end of each word back to the starting state, which is the accepting state. So there are only ever at most two possible successor states for a given (state, input) pair. But the NFA could at any given time be in a superposition of many states, up to one per dictionary word. So this algorithm can take O(MN) time for a dictionary of size M and a string of size N. As you can verify by uncommenting the ``print`` statement, the ``cruel`` example above is such a pathological case, so it takes 55 milliseconds. The ``word_break_regexp`` function above, on the other hand, would take on the order of 2²⁸⁴ seconds, about 10⁶⁷ times longer than the age of the universe. Modifying this to generate a DAWG might be a good way to optimize it. """ transitions = {} state_counter = 0 for word in dictionary: state = 0 for c in word: succ = transitions.get((state, c)) if succ is None: state_counter += 1 succ = transitions[state, c] = state_counter state = succ # We’re at the end of a word. Is there an ε-transition here yet? if (state, None) not in transitions: # If not, we add one back to the starting state. transitions[state, None] = 0 # Keep the superposition of states in a Python “set” to ensure we # don’t get duplicate states. states = {0} for c in s: #print(c, states) states = {succ for succ in (transitions.get((state, c)) for state in states) if succ is not None} # Check for ε-transitions and, if present, add the states they # point to. Copy the set of states first so we aren’t # iterating over it as we modify it. for state in list(states): if (state, None) in transitions: states.add(transitions[state, None]) # Accept iff one of the possible states is the initial state. return (0 in states) test_all(word_break_nfa)