#!/usr/bin/python3
"""Return the longest substring of an input string without repeated characters.

From a Leetcode problem, which specified that the maximum input string
length was 500_000 characters and that it would consist of "English
letters, numbers, and symbols"; not sure whether "symbols" is intended
to be qualified by "English" or whether it includes √ and ★, and
indeed whether those are "English" or not.  The Hypothesis tests here
are written to use any Unicode characters.

>>> longunstr('asdfasdgha')
'fasdgh'

There are three more algorithms that are supposed to do the same thing
more efficiently:

>>> longunstr2('asdfasdgha')
'fasdgh'
>>> longunstr3('asdfasdgha')
'fasdgh'
>>> longunstr4('asdfasdgha')
'fasdgh'

On my palmtop, for the 512-character string
'HHHHFFFkkPPCCCDDDPPSOJJJVVVaDDCCCbdAAAKKJJJIIIIIGGGPPDDDdYJJJYOOBBBHPPNNNHHHZXUUFFFQAAAPPPDDDSSCCCLLLTTNBBBLVjBBBLLLNUOOFFFTTNNNTTIIIJJBBBEKKEEEWDDDUPPPcSSEEEtMMMQONNKKKNNZZXXXVFFFMMbMMMLLOHCCYBBJEEZktAARPPhhNNahPPYLLNfXXXQQqWCCTTkhVVJIIEEGGigggUTMMMdDDLQQtiRROODDOhhiffmggjeecIIIWBBNQPCCQLLRRnRQQvAACVVfccOOBBldLFFJKsUUjeSNDDVVIIaaaacddFFqmSSbfMMHHRRcceDDrmlKKIITDDCBBDMaJJOJJLLWWdIIkBBRXXlXXVEEFmjMMTUffDDrrmXXmrdbUUgbSSdZZmLFFLbTICCCCPLLllSSVPPGGhhYFFAANfBBjhKKjbbsZZiggQJJUUhBBjIIZGGIIYbMMnnnnDDckjJJRYZVVVef'
the first version runs in 460ms, the second in 550μs, the third in
230μs, and the fourth in 160μs.  Fitting a quadratic through that one
point suggests that the first version takes about 1.8μs·N², so at
500_000 bytes it would take about 440_000 seconds, roughly 122 hours
(and, since that algorithm is really O(N³), the extrapolation is
probably optimistic).  The fourth version takes about 0.31μs·N, and
thus about 160ms for a string of 500_000 characters.  But if there is
a large number of different characters, it seems to exceed this
typically by about a factor of 3, presumably due to reduced memory
locality.  This means that at this input size the fourth algorithm is
at worst about a million times faster, and typically several million
times faster.

I've tested all four algorithms against each other on 7100 randomly
generated strings using Hypothesis; they always produce the same
results.

"""
import doctest
import sys
import string

import hypothesis
from hypothesis import given, strategies


@hypothesis.settings(deadline=5000)   # ms; the first algorithm is slow
@given(s=strategies.text(max_size=1024)
       | strategies.text(min_size=256, max_size=512)   # ensure some long inputs
       | strategies.text(alphabet=string.ascii_letters,  # ensure some repeats
                         min_size=256, max_size=512)
       )
def test_equivalence(s):
    "Generative test for use with PyTest.  Run `py.test-3 longunstr.py`."
    result = longunstr(s)
    assert result == longunstr2(s)
    assert result == longunstr3(s)
    assert result == longunstr4(s)
    #open('tmp.test.out', 'a').write('passed for %r\n' % s)


def longunstr(s):
    """Straightforward O(N³) solution; just a Python statement of the problem.

    As mentioned above, this takes 460ms on a 512-byte example string.

    """
    return max((s[i:j]
                for i in range(len(s)+1)
                for j in range(i, len(s)+1)
                if len(set(s[i:j])) == j - i),
               key=len)


def longunstr2(s):
    """O(N) solution incrementally maintaining a set of seen chars.

    I claim that this is O(N) rather than O(N²) because there is some
    finite number M of possible distinct characters, and the inner
    loop runs at most M times, no matter how large N is, so O(MN) is
    in O(N) because M is a constant.  However, in my example of a
    string of 500_000 distinct Unicode code points (where M = N),
    it's still indistinguishable from O(N²)!

    As mentioned above, this takes 550μs on a 512-byte example string.
""" best_start, best_end = 0, 0 for i in range(len(s)): seen = set() for j in range(i, len(s)): if s[j] in seen: break seen.add(s[j]) if j+1 - i > best_end - best_start: best_start, best_end = i, j+1 return s[best_start:best_end] def longunstr3(s): """Faster O(N) solution incrementally maintaining a set of seen chars. Here, instead of computing from scratch the longest substring without repeated characters for each starting position i, we incrementally compute the longest substring without repeated characters for each ending position j. The idea is that it's the character at position j, preceded by some (possibly empty) suffix of the substring that ended at j-1. So we scan j from left to right over the string, at each step moving i to the right toward it only as far as is necessary to eliminate duplicates. This examines each character in the string at most twice: once with i, once with j. As mentioned above, this takes 230μs on a 512-byte example string. """ i = 0 seen = set() best_start, best_end = 0, 0 for j in range(len(s)): c = s[j] while c in seen: seen.remove(s[i]) i += 1 assert i <= j seen.add(c) if j+1 - i > best_end - best_start: best_start, best_end = i, j+1 return s[best_start:best_end] def longunstr4(s): """Faster O(N) solution, updating history in constant time. This is a tweak of the previous algorithm in which we keep track of the last seen *position* of each character, enabling us to leap the index i forward to after that position in constant time, instead of incrementally moving one by one. We can treat last-seen-positions of less than i as meaning "absent", so we don't have to update those when we bump i. This examines each character in the input string exactly once. As mentioned above, this takes 160μs on a 512-byte example string. It took 170μs before I did the debatable get, put, and best_len optimizations. This does not blow up on strings of the size specified; on a string consisting of 500_000 repetitions of 'x', it takes 160ms, while on a string consisting of the first 500_000 Unicode code points, each occurring once, it takes 450ms. """ i = 0 # index of beginning of substring last = {} # most recent position of each char get = last.get # cargo-cult optimization put = last.__setitem__ # cargo-cult optimization # These are the start, end, and index difference of the longest # acceptable substring seen so far: best_start, best_end, best_diff = 0, 0, -1 for j, c in enumerate(s): # s[j] == c p = get(c, -1) # When did we last see this c? if p >= i: # If it's within the current substring, i = p + 1 # advance i so that it's not. put(c, j) # Update `last` so last[s[j]] == j if j-i > best_diff: # Is our new substring best? best_start, best_end, best_diff = i, j+1, j-i return s[best_start:best_end] if '__main__' == __name__: doctest.testmod() for arg in sys.argv: print(arg, longunstr(arg), longunstr2(arg), longunstr3(arg), longunstr4(arg))