#!/usr/bin/python3
"""Leetcodish problem using a dynamic programming algorithm.
Not sure if ChatGPT invented this problem or directly copied it from
Leetcode or a similar site.
The problem is whether you can build a given string out of words from
a “dictionary” list by concatenating them (without spaces between,
which means that there’s potentially more than one way to produce a
given string).
The dictionary is guaranteed to contain at most 1000 words, which are
alphanumeric and nonempty, and the string is at most 300 characters.
I’m also assuming the dictionary words can’t be more than 300
characters.
"""
import re
def test_all(f, cruel=True):
test(f, True, "leetcode", ["leet", "code"])
test(f, False, "xleetcode", ["leet", "code"])
test(f, False, "lecodeet", ["code", "leet"])
test(f, False, "leetcodex", ["leet", "code"])
test(f, False, "catsandog", ["cats", "dog", "sand", "and", "cat"])
test(f, True, "catsanddog", ["cats", "dog", "sand", "and", "cat"])
test(f, True, "catsanddog", ["cats", "dog", "sand", "cat"])
test(f, True, "applepenapple", ["apple", "pen"])
test(f, False, "xapplepenapple", ["apple", "pen"])
# The above verifies that the code gives correct examples in some
# cases, but what if it’s inefficient? Let’s test its efficiency,
# at least a little.
# 988 possible words. Walk softly and carry a big dict.
big_dict = ['word' + c + c2 + c3
for c in 'abcdefghijklmnopqrstuvwxyz'
for c2 in 'abcdefghijklmnopqrs'
for c3 in 'ab'
]
big_string = 'kettles' * 50 + 'x' # 295 characters
test(f, False, big_string, big_dict)
test(f, False, big_string, big_dict + ['kettles'])
test(f, True, big_string, big_dict + ['kettles', 'x'])
# Here’s a much worse case for some algorithms.
cruel_dict = [c * n for c in 'abc' for n in range(1, 301)]
test(f, True, 'a' * 300, cruel_dict)
test(f, False, 'a' * 10 + 'x', cruel_dict)
if cruel:
test(f, False, 'a' * 299 + 'x', cruel_dict)
def test(f, expected, s, dictionary):
#print(f, s[:32] + ('' if len(s) <= 32 else '...'))
assert f(s, dictionary) == expected, (s, dictionary)
def wordBreak(s, dictionary):
# This is the first version I improvised off the cuff.
reached = [False] * (len(s) + 1) # Can we reach this position?
for word in dictionary: # Say, from the beginning of the string?
if s.startswith(word):
reached[len(word)] = True
# Now, we want to see if we can reach positions from previously
# reached positions.
for i in range(len(reached)):
for word in dictionary:
start = i - len(word)
if start >= 0 and reached[start] and s[start:i] == word:
reached[i] = True
#print(list(zip(s, reached)))
return reached[-1]
test_all(wordBreak)
def word_break_simpler(s, dictionary):
# This is a better version, reached by modifying the version
# above. As a bonus, it correctly returns True for the empty
# string. We start by noting that we can reach the beginning of
# the string.
reached = [True] + [False] * len(s)
for i in range(len(reached)):
if reached[i]:
# Obvious optimizations here would include, for example,
# only testing words that start with the right letter.
for word in dictionary:
if s[i:i + len(word)] == word:
reached[i + len(word)] = True
return reached[-1]
test_all(word_break_simpler)
def word_break_regexp(s, dictionary):
"""Solution using regular expressions.
With Python’s regular expression engine this takes exponential
time for some inputs; see the ``cruel`` test above for an example.
Even with the input ``'a' * 20 + 'x'`` it takes 12 seconds, 23
seconds with ``'a' * 21 + 'x'``, 43 seconds with ``'a' * 22 +
'x'`, 86 seconds with 23, etc.
"""
words = '|'.join(re.escape(word) for word in dictionary)
return True if re.match(f'(?:{words})*\\Z', s) else False
test_all(word_break_regexp, cruel=False)
def word_break_nfa(s, dictionary):
"""Solution by manually implementing regexps with a Thompson NFA.
On the first ``catsanddog`` example above, the one with the
dictionary ``["cats", "dog", "sand", "and", "cat"]``, it goes
through the following state sequence:
c {0}
a {1}
t {2}
s {0, 3} -- possibly a new word after “cat” or the “s” in “cats”
a {8, 0, 4} -- 8 is after “s” of “sand”, 4 is past the end of “cats”
n {9, 12} -- now we could be in either “sand” or “and”
d {10, 13}
d {0, 11, 14} -- now, a new word or off the end of “sand” or “and”
o {5}
g {6}
In a sense this is the “only testing words that start with the
right letter” approach taken to its logical extreme, to the point
that the dynamic-programming part isn’t even necessary any more.
is a good, short
introduction to the automata theory behind this approach (and
related approaches that are more general!). Compilers textbooks
usually also cover it, often at rather excessive length. I think
I first read about it in 01995 but haven't implemented it even
once in the last 28 years.
This is almost “solution by implementing a trie” or “by
implementing a DAWG”. The NFA is only barely nondeterministic: it
includes an ε-transition (unlabeled arrow) at the end of each word
back to the starting state, which is the accepting state.
So there are only ever at most two possible successor states for a
given (state, input) pair. But the NFA could at any given time be
in a superposition of many states, up to one per dictionary word.
So this algorithm can take O(MN) time for a dictionary of size M
and a string of size N. As you can verify by uncommenting the
``print`` statement, the ``cruel`` example above is such a
pathological case, so it takes 55 milliseconds. The
``word_break_regexp`` function above, on the other hand, would
take on the order of 2²⁸⁴ seconds, about 10⁶⁷ times longer than
the age of the universe.
Modifying this to generate a DAWG might be a good way to optimize
it.
"""
transitions = {}
state_counter = 0
for word in dictionary:
state = 0
for c in word:
succ = transitions.get((state, c))
if succ is None:
state_counter += 1
succ = transitions[state, c] = state_counter
state = succ
# We’re at the end of a word. Is there an ε-transition here yet?
if (state, None) not in transitions:
# If not, we add one back to the starting state.
transitions[state, None] = 0
# Keep the superposition of states in a Python “set” to ensure we
# don’t get duplicate states.
states = {0}
for c in s:
#print(c, states)
states = {succ
for succ in (transitions.get((state, c)) for state in states)
if succ is not None}
# Check for ε-transitions and, if present, add the states they
# point to. Copy the set of states first so we aren’t
# iterating over it as we modify it.
for state in list(states):
if (state, None) in transitions:
states.add(transitions[state, None])
# Accept iff one of the possible states is the initial state.
return (0 in states)
test_all(word_break_nfa)