#!/usr/bin/python
# -*- coding: utf-8 -*-
"""An example SQL database to show what those are good for.

This should be about 500 megabytes of line items, which should be
about 64 bytes, so that’s about 8 mebi-items.  If we somewhat
unrealistically distribute these equally 32 per invoice, which are
equally distributed 128 per customer, we need 2048 customers, giving
262144 invoices.

If we want to generate unique names for our customers, we need at
least 11 bits of entropy, and maybe as much as 22 if we want to keep
collisions minimal.  I think 14 bits should be plenty, which we can
achieve with 128 (uniformly random) first names and 128 last names.

Similarly we need 22 bits of randomness per address if we want them to
be probably collision-free, but that’s easier to achieve, among other
things because addresses include street numbers and ZIP codes.

Somewhat disappointingly, this takes 71 minutes to load the eight
million line items into Postgres.  Just generating a SQL script with
the data in it only takes 4'40", so the Python logic is not the
bottleneck here, though psql running with that SQL script for input
finishes in under 51' instead of 71', so the psycopg2 bindings may be
somewhat at fault here.

The LevelDB implementation is still profoundly suboptimal
for ad-hoc queries, but it
currently finishes inserting all the data in 9'45".
(I succumbed to the temptation to use LevelDB string keys as a
hierarchical database, and now it’s totally impractical to do any
queries that do a sequential scan on invoice records.  Fixing this
will involve reorganizing the data structure to be relational.)

This is still only inserting about 10,000 key-value pairs per second,
while leveldb-62m.cc on the same machine is able to insert about
300,000 (smaller and less interesting) key-value pairs per second.

The LevelDB implementation not only takes less time than Postgres but
also less disk space: 240MB rather than Postgres’s 600+.  As far as I
can tell, this is without Snappy (although I don’t know why Snappy is
not activated).

"""
from __future__ import division
import calendar
try:
    import cPickle as pickle
except ImportError:
    import pickle
import decimal
import json
import math
import marshal
import random
import string
import sys
import time

import psycopg2
try:
    import leveldb
except ImportError:
    pass                      # apparently leveldb was lost in Python3

try:
    unicode
except NameError:
    unicode = str


def main(argv):
    fast = False
    sql = False
    leveldb = False
    n_customers = 2048
    while True:
        if argv[1] == '-f':
            fast = True
        elif argv[1] == '-sql':
            sql = True
        elif argv[1] == '-leveldb':
            leveldb = True
        elif argv[1] == '-n':
            n_customers = int(argv[2])
            argv[2:3] = []
        elif argv[1].startswith('-'):
            raise InvalidArgument(argv[1])
        else:
            break

        argv[1:2] = []

    if sql:
        db = SqlScriptInvoiceDb(open(argv[1], 'wb'))
    elif leveldb:
        db = LevelDbInvoiceDb(argv[1])
    else:
        db = PostgresInvoiceDb('dbname=' + argv[1])

    if not fast:
        db.recreate()

        cust_ids = generate_customers(db, n_customers)
        products = generate_products(db, 1024)

        for jj, cust_id in enumerate(cust_ids):
            (name,) = db.get_cust_by_id(cust_id, ('name',))
            print("generating invoices for %s (%d)" % (name, cust_id))

            for ii in range(128):
                invoice_id = generate_invoice(db, cust_id)
                for jj in range(32):
                    generate_line_item(db, invoice_id, random.choice(products))

            if jj % 8 == 7:
                db.commit()

        db.commit()

    if not sql:
        display_invoice(db, 36)
        display_invoice(db, 937)
        display_invoice(db, 338)

class InvalidArgument(Exception): pass

def generate_customers(db, n):
    names = set()
    cust_ids = []
    for ii in range(n):
        while True:
            name = generate_name()
            if name not in names:
                names.add(name)
                break

        cust_ids.append(db.add_cust(name=name,
                                    company=generate_company(),
                                    streetaddress=generate_street_address(),
                                    city=random.choice(cities),
                                    state=random.choice(states),
                                    zip_='%05d' % random.randrange(1e5),
        ))

        # Keep the transactions from getting too big.
        if ii % 128 == 0:
            db.commit()

    db.commit()
    return cust_ids

# Note that this simple approach suffers from the Mohammed Wang
# problem.
def generate_name():
    return ' '.join([random.choice(forenames),
                     random.choice(surnames)])

# This simple approach starts generating collisions after a few dozen,
# but that should be okay for this example.
def generate_company():
    company_type = random.choice(['Enterprises', 'Group', 'Inc.', 'LLC', ''])
    surname = random.choice(surnames)
    base_name = random.choice([surname,
                               ''.join(random.choice(string.ascii_uppercase)
                                       for i in range(random.randrange(2, 5))),
                               '–'.join([surname, random.choice(surnames)]),
                               surname + ' & Associates',
                               surname + ' & Company',
                               '%s, %s and %s' % (surname,
                                                  random.choice(surnames),
                                                  random.choice(surnames)),
    ])
    base = random.choice([' '.join([base_name,
                                    random.choice(['Construction',
                                                   'Transportation',
                                                   'Couriers',
                                                   'Petroleum',
                                                   'Software',
                                                   'Consultants',
                                                   'Holdings',
                                                   'Engineering',
                                                   'Insurance']),
                                    company_type]),
                          ' '.join([base_name,
                                    company_type]),
                          ])
    return base

# This starts generating collisions after only ten or twenty items,
# but that’s okay.  <http://www.tin-can.org/members.htm> has some
# examples of what this kind of thing looks like.
def generate_street_address():
    # Most US cities have about 20 numbered streets.
    numbered = 1 + int(random.expovariate(1/19))
    # Most streets have addresses going up to about 5000.
    length = 100 + int(random.expovariate(1/4900))
    street_type = random.choice('St. Rd. Ave. Dr. Place Circle Ct.'.split())
    nstreet = (ordinal(1 + random.randrange(numbered)).title()
               + ' ' + random.choice(['St.', 'Ave.']))
    common = random.choice(common_street_names)

    street_category = random.random()
    street_name = (nstreet if street_category < .3 else
                   common + ' ' + street_type if street_category < .7 else
                   random.choice(surnames) + ' ' + street_type)

    if random.random() < .2:
        direction = random.choice('North South East West'.split())
        street_name = direction + ' ' + street_name

    number = random.randrange(1, length)
    return '%s %s' % (number, street_name)

def generate_products(db, n):
    products = []
    for ii in range(n):
        product = generate_product()
        # Consumer products like those listed here are usually between
        # US$5 and US$200, of which $31 is the geometric mean.
        # “Usually” might mean 2σ on a standard normal deviation.
        raw_price = random.lognormvariate(math.log(31), math.log(31/5)/2)
        price = round_to_sigfigs(3, raw_price)

        product_id = db.add_product(product, price)
        products.append((product_id, product, price))

        # Keep the transactions from getting too big.
        if ii % 128 == 0:
            db.commit()
        
    db.commit()                 # just in case 128 doesn’t divide n

    return products

def round_to_sigfigs(n, x):
    return round(x, int(n-math.floor(math.log10(x))-1))

def ok(a, b): assert a == b, (a, b)
ok(round_to_sigfigs(2, 34.345), 34.0)
ok(round_to_sigfigs(1, 34.345), 30.0)
ok(round_to_sigfigs(3, 34.345), 34.3)
ok(round_to_sigfigs(4, 34.344), 34.34)
ok(round_to_sigfigs(4, 134.344), 134.30)

def generate_invoice(db, cust_id):
    ago = random.randrange(365 * 24 * 60 * 60)
    status = ('unsent' if ago < 24 * 60 * 60 * 3 else
              'unpaid' if random.random() < .001 else
              'paid')
    due_epoch_seconds = time.time() - ago + 30 * 24 * 60 * 60
    due = time.strftime('%Y-%m-%d', time.gmtime(due_epoch_seconds))

    return db.add_invoice(cust_id, due, status)

def generate_line_item(db, invoice_id, product_row):
    productid, name, unitprice = product_row
    qty = int(1 + math.floor(random.expovariate(1)))
    db.add_line_item(qty, unitprice, productid, invoice_id)

header_template = """
INVOICE  %(status)s  Due %(due)s

%(name)s
%(company)s
%(streetaddress)s
%(city)s, %(state)s  %(zip)s

""".lstrip()

def display_invoice(db, invoice_id):
    (name, company, streetaddress, city, state, zip_, status, due
    ) = db.get_invoice_by_id(invoice_id)

    print((header_template % dict(name=name,
                                  company=company,
                                  streetaddress=streetaddress,
                                  city=city,
                                  state=state,
                                  zip=zip_,
                                  status='PAID' if status == 'paid' else '',
                                  due=due,
    )))

    names, prices, qties, totals = zip(*db.get_line_items(invoice_id))
    sum_totals = sum(totals)
    price_width = 9
    qty_width = 5
    widths = (2+len(max(names, key=len)), price_width, qty_width, price_width)
    prices = [('$' + str(price)).rjust(price_width-1) for price in prices]
    totals = [('$' + str(price)).rjust(price_width-1) for price in totals]
    qties = [str(qty).rjust(qty_width-1) for qty in qties]
    sum_totals = ('$' + str(sum_totals)).rjust(price_width-1)

    print_cols(widths, ('Product Name', 'Unit price', 'Quantity', 'Price'))
    print_cols(widths, ('', '', '', ''), pad='-', sep='+')
    for ii in range(len(names)):
        print_cols(widths, (' ' + names[ii], prices[ii], qties[ii], totals[ii]))
    print_cols(widths, ('', '', '', ''), pad='-', sep='+')
    print_cols(widths, ('%d items' % len(names), '', 'Total:', sum_totals),
               sep=' ')

def print_cols(widths, values, pad=' ', sep='|'):
    sfields = []
    for ii, (width, val) in enumerate(zip(widths, values)):
        if not isinstance(val, unicode):
            val = val.decode('utf-8')
        sfields.append(val.ljust(width, pad)[:width])
    print(sep.join(sfields).encode('utf-8'))

def ordinal(n):
    """Convert an int to an English ordinal.

    cf. <https://en.wikipedia.org/wiki/English_numerals#Ordinal_numbers>
    """
    assert n > 0
    if n < 20:
        return '''first second third fourth fifth
                  sixth seventh eighth ninth tenth
                  eleventh twelfth thirteenth fourteenth fifteenth
                  sixteenth seventeenth eighteenth nineteenth'''.split()[n-1]
    elif n < 100:
        tens = n//10
        tens_base = 'twen thir for fif six seven eigh nine'.split()[tens-2]
        if n % 10 == 0:
            return tens_base + 'tieth'
        else:
            return tens_base + 'ty-' + ordinal(n % 10)
    else:
        suffix = ('st' if n % 10 == 1 else
                  'nd' if n % 10 == 2 else
                  'rd' if n % 10 == 3 else
                  'th')
        return str(n) + suffix

ok(ordinal(1), 'first')
ok(ordinal(2), 'second')
ok(ordinal(9), 'ninth')
ok(ordinal(11), 'eleventh')
ok(ordinal(12), 'twelfth')
ok(ordinal(18), 'eighteenth')
ok(ordinal(20), 'twentieth')
ok(ordinal(21), 'twenty-first')
ok(ordinal(28), 'twenty-eighth')
ok(ordinal(37), 'thirty-seventh')
ok(ordinal(40), 'fortieth')
ok(ordinal(56), 'fifty-sixth')
ok(ordinal(99), 'ninety-ninth')
ok(ordinal(100), '100th')
ok(ordinal(101), '101st')
ok(ordinal(166), '166th')
ok(ordinal(172), '172nd')
ok(ordinal(183), '183rd')

# 128 surnames taken from
# <https://en.wikipedia.org/wiki/List_of_most_common_surnames_in_North_America>
surnames = """
Adams
Allen
Almonte
Anderson
Araya
Bailey
Baker
Barnes
Bell
Bennett
Brooks
Brown
Butler
Calderón
Campbell
Carter
Castillo
Clark
Collins
Cook
Cooper
Cox
Cruz
Davis
De la Cruz
De los Santos
Díaz
Edwards
Evans
Féliz
Fernández
Fisher
Flores
Foster
Gagnon
Garcia
Gómez
González
Gray
Green
Gutiérrez
Guzmán
Hall
Harris
Hernández
Herrera
Hill
Howard
Hughes
James
Jenkins
Johnson
Jones
Kelly
King
Lam
Lee
Lewis
Li
Long
López
Madrigal
Martin
Martinez
Mejía
Miller
Mitchell
Moore
Mora
Morales
Morgan
Morris
Murphy
Myers
Nelson
Nguyen
Núñez
Ortiz
Parker
Pérez
Perry
Peterson
Phillips
Polanco
Powell
Price
Ramírez
Reed
Reyes
Richardson
Rivera
Roberts
Robinson
Rodríguez
Rogers
Rojas
Rosario
Ross
Roy
Russell
Sánchez
Sanders
Santana
Santos
Scott
Segura
Smith
Stewart
Sullivan
Taylor
Thomas
Thompson
Torres
Tremblay
Turner
Valverde
Vargas
Vásquez
Villalobos
Walker
Ward
Watson
White
Williams
Wilson
Wood
Wright
Young
""".strip().split('\n')
assert len(surnames) == 128

# Names taken from
# <https://www.ssa.gov/oact/babynames/decades/century.html>
forenames = """
Aaron
Adam
Alan
Albert
Amanda
Amber
Amy
Andrea
Andrew
Ann
Anthony
Arthur
Austin
Benjamin
Beverly
Billy
Bobby
Brian
Brittany
Bryan
Carl
Carol
Carolyn
Catherine
Charles
Christian
Christina
Cynthia
Daniel
Danielle
Deborah
Debra
Dennis
Diana
Diane
Donna
Dorothy
Douglas
Dylan
Edward
Eric
Ethan
Eugene
Evelyn
Frances
Gabriel
George
Gloria
Grace
Gregory
Hannah
Harold
Heather
Helen
Howard
Jack
Jacob
Jacqueline
Janet
Janice
Jason
Jean
Jeffrey
Jennifer
Jeremy
Jerry
Joan
Johnny
Joshua
Joyce
Juan
Judith
Julie
Karen
Katherine
Kathy
Kelly
Kenneth
Kevin
Kyle
Larry
Lauren
Lawrence
Linda
Lisa
Louis
Maria
Marie
Marilyn
Mark
Martha
Mary
Matthew
Melissa
Michelle
Natalie
Nathan
Nicholas
Nicole
Noah
Olivia
Pamela
Patricia
Patrick
Paul
Rachel
Ralph
Rebecca
Rose
Roy
Russell
Ruth
Ryan
Samantha
Sarah
Sean
Shirley
Stephen
Steven
Susan
Teresa
Terry
Timothy
Tyler
Victoria
Walter
Wayne
Zachary
""".strip().split('\n')
assert len(forenames) == 128

# Common US city names taken from
# <https://en.wikipedia.org/wiki/List_of_the_most_common_U.S._place_names>
cities = """
Washington
Springfield
Franklin
Greenville
Bristol
Clinton
Fairview
Salem
Madison
Georgetown
Arlington
Ashland
Dover
Oxford
Jackson
Burlington
Manchester
Milton
Newport
Auburn
Centerville
Clayton
Dayton
Lexington
Milford
Mount Vernon
Oakland
Winchester
Cleveland
Hudson
Kingston
Riverside
""".strip().split('\n')

# US state abbreviations from
# <https://en.wikipedia.org/wiki/List_of_U.S._state_abbreviations>
states = """AL
AK
AZ
AR
CA
CO
CT
DE
FL
GA
HI
ID
IL
IN
IA
KS
LA
ME
MD
MI
MN
MS
MO
MT
NE
NV
NH
NJ
NM
NY
NC
ND
OH
OK
OR
RI
SC
SD
TN
TX
UT
VT
WA
WV
WI
WY
KY
MA
PA
VA
""".strip().split()
assert len(states) == 50

# Some names crudely retyped from
# <https://fivethirtyeight.com/datalab/whats-the-most-common-street-name-in-america/>
# and
# <http://www.usd116.org/profdev/ahtc/lessons/PlautFel09/scans/2009_07_09/StreetNamesCensus.pdf>.
common_street_names = """Main Park Oak Maple Pine Washington Cedar Elm Walnut Lake Sunset
Lincoln Jackson Church River Willow Jefferson Center Lakeview
Ridge Hickory Adams Cherry Highland Johnson Dogwood
Chestnut Spruce Wilson Meadow Forest Hill Madison Mill Railroad
""".strip().split()

def generate_product():
    if random.random() < .7:
        garment = random.choice(garments)
        color = random.choice(colors)
        fiber = random.choice(fibers)
        size = random.choice(sizes)
        adj = random.choice(adjectives)
        return ('%s %s %s %s, %s' % (color.capitalize(),
                                     fiber,
                                     adj,
                                     garment,
                                     size))
    elif random.random() < .5:
        return ' '.join([random.choice(adjectives),
                         random.choice(colors),
                         random.choice(hard_materials),
                         random.choice(toys)]).capitalize()
    else:
        return ' '.join([random.choice(colors),
                         random.choice(adjectives),
                         random.choice(hard_materials),
                         random.choice(appliances)]).capitalize()

colors = "white black red blue green yellow gray beige gold blue".split()
fibers = "cotton polyester spandex wool".split()
garments = "T-shirt boxers briefs pants blazer dress sweater".split()
adjectives = """ultrasonic intimate surgical dynamic utility
    biological personal daily electronic""".split()
sizes = """
child’s size
ages 2–5
ages 3–10
size S
size M
size L
size XL
""".strip().split("\n")
hard_materials = "plastic aluminum ceramic steel glass graphite".split()
toys = "car helicopter doll bicycle trike truck".split()
appliances = """
kettle
microwave
toaster oven
blender
toaster
juicer
skillet
refrigerator
rice cooker
pressure cooker
""".strip().split("\n")


class PostgresInvoiceDb:
    def __init__(self, dsn):
        self.db = psycopg2.connect(dsn)
        self.cur = self.db.cursor()

    ddl = b"""
    drop table if exists lineitem;
    drop table if exists invoice;
    drop table if exists product;
    drop table if exists cust;
    create table cust
           ( id serial primary key
           , name varchar not null
           , company varchar
           , streetaddress varchar
           , city varchar
           , state varchar
           , zip varchar
           );
    create table invoice
           ( id serial primary key
           , custid int references cust (id) not null
           , due timestamp
           , status varchar check (status in ('paid', 'unpaid', 'unsent'))
           );
    create table product
           ( id serial primary key
           , name varchar not null
           , unitprice decimal(8, 2) not null
           );
    create table lineitem
           ( qty int not null
           , unitprice decimal(8, 2) not null
           , productid int references product (id) not null
           , invoiceid int references invoice (id) not null
           );

    -- Without this index, finding the line items for an invoice takes a
    -- second or two.
    create index lineitem_invoiceid on lineitem (invoiceid);

    -- This index speeds up finding unpaid invoices by about a factor of
    -- 5.
    create index invoice_status on invoice (status);

    -- It would probably make sense to have an index on invoice by custid,
    -- too, but so far the gains are fairly minimal.

    """
    def recreate(self):
        self.cur.execute(self.ddl)
        self.commit()           # The default is to rollback!

    def commit(self):
        self.db.commit()

    def add_cust(self, name, company, streetaddress, city, state, zip_):
        self.cur.execute("""insert into cust
                            (name, company, streetaddress, city, state, zip)
                            values (%s, %s, %s, %s, %s, %s) returning id""",
                         (name, company, streetaddress, city, state, zip_))
        ((cust_id,),) = self.cur.fetchall()
        return cust_id

    def get_cust_by_id(self, cust_id, fields):
        query = "select %s from cust where id = %%s" % ', '.join(fields)
        self.cur.execute(query, (cust_id,))
        (row,) = self.cur.fetchall()
        return row

    def add_product(self, product, price):
        self.cur.execute("""insert into product (name, unitprice)
                            values (%s, %s)
                            returning id
                         """,
                         (product, price))
        ((product_id,),) = self.cur.fetchall()
        return product_id

    def add_invoice(self, cust_id, due, status):
        self.cur.execute("""insert into invoice (custid, due, status)
                            values (%s, %s, %s)
                            returning id""",
                         (cust_id, due, status))
        ((invoice_id,),) = self.cur.fetchall()
        return invoice_id

    def add_line_item(self, qty, unitprice, productid, invoice_id):
        self.cur.execute("""insert into lineitem
                            (qty, unitprice, productid, invoiceid)
                            values (%s, %s, %s, %s)""",
                         (qty, unitprice, productid, invoice_id))

    def get_invoice_by_id(self, invoice_id):
        self.cur.execute("""select
                              c.name, c.company, c.streetaddress, c.city, c.state, c.zip,
                              invoice.status, invoice.due
                            from cust as c
                            join invoice on invoice.custid = c.id
                            where invoice.id = %s
                         """, (invoice_id,))
        (row,) = self.cur.fetchall()
        return row

    def get_line_items(self, invoice_id):
        self.cur.execute("""select
                              product.name, lineitem.unitprice, qty,
                              lineitem.unitprice*qty
                            from lineitem
                            join product on lineitem.productid = product.id
                            where invoiceid = %s
                        """, (invoice_id,))
        return self.cur.fetchall()

def sql_escape(value):
    if isinstance(value, unicode):
        value = value.encode('utf-8')
    if isinstance(value, bytes):
        return b"'%s'" % value.replace(b"'", b"''")
    else:
        return str(value).encode('utf-8')

class SqlScriptInvoiceDb:
    def __init__(self, output_file_object):
        self.out = output_file_object
        self.max_id = {}
        self.last_max_id = {}
        self.cust = {}
        self.out.write(b'begin;\n')

    ddl = PostgresInvoiceDb.ddl

    def recreate(self):
        self.out.write(self.ddl)
        self.commit()

    def commit(self):
        self.flush_max_id_changes()
        self.out.write(b'commit;\n')
        self.out.write(b'begin;\n')

    def flush_max_id_changes(self):
        for table, new_max in self.max_id.items():
            if new_max == self.last_max_id.get(table, 1):
                continue

            self.out.write(b'select setval(%s, %d);\n'
                           % (sql_escape(b'%s_id_seq' % table.encode('utf-8')),
                                         new_max))
            self.last_max_id[table] = new_max

    def insert_id(self, table, fields, values):
        if table not in self.max_id:
            self.max_id[table] = 0

        self.max_id[table] += 1
        row_id = self.max_id[table]

        self.out.write(self.insert_statement(
            table,
            ('id',) + fields,
            (row_id,) + values))
        return row_id

    def insert_statement(self, table, fields, values):
        return b"insert into %s (%s) values (%s);\n" % (
            table.encode('utf-8'),
            b', '.join(field.encode('utf-8') for field in fields),
            b', '.join(map(sql_escape, values)),
        )

    cust_fields = tuple('name company streetaddress city state zip'.split())
    def add_cust(self, name, company, streetaddress, city, state, zip_):
        row = (name, company, streetaddress, city, state, zip_)
        row_id = self.insert_id('cust', self.cust_fields, row)
        self.cust[row_id] = row
        return row_id

    def get_cust_by_id(self, cust_id, fields):
        row = dict(zip(self.cust_fields, self.cust[cust_id]))
        return tuple(row[field] for field in fields)

    def add_product(self, product, price):
        return self.insert_id('product',
                              ('name', 'unitprice'),
                              (product, price))

    def add_invoice(self, cust_id, due, status):
        return self.insert_id('invoice',
                              ('custid', 'due', 'status'),
                              (cust_id, due, status))

    def add_line_item(self, qty, unitprice, productid, invoice_id):
        self.out.write(self.insert_statement(
            'lineitem',
            'qty unitprice productid invoiceid'.split(),
            (qty, unitprice, productid, invoice_id)))


class LevelDbInvoiceDb:
    """Store the invoice data in LevelDB with the same indexing as in Postgres.

    But now we are responsible for updating the indexing, and for that
    matter encoding the data and separating the tables, ourselves.
    Here’s the key schema we’re using.

    - `cust 12345` is customer id 12345.  The value is a JSON encoding
      of the list [name, company, streetaddress, city, state, zip],
      where zip is a number.

    - `cust id` is the next available customer id, as a decimal
      string.

    - `invoice 12345` is invoice id 12345.  The value is a JSON
      encoding of the list [custid, due, status].  `custid` is a
      number; `due` is a number of seconds since 1969 UTC (the Unix
      epoch); `status` is a string.

    - `invoice id` is the next available invoice id, as a decimal
      string.

    - `product 12345` is product id 12345.  The value is a JSON
      encoding of the list [name, unitprice].  `unitprice` is a
      number.

    - `product id` is the next available product id, as a decimal
      string.

    - `invoice 12345 item 3` is line item 3 of invoice 12345.  The
      value is a JSON encoding of the list [qty, unitprice,
      productid], which are all numbers.  The disadvantage of this key
      schema is that it makes it hard to fetch all the line items, for
      example to find all the line items for a given product; the
      invoices will be mixed in with them and need to be filtered out.
      Much worse, though, it means you can’t fetch all the invoices
      without fetching all the line items.  This is totally fatal
      because it means you need an index on invoice for any kind of
      fast query, like by customer id.  So this needs to change.

    - `invoice 12345 item id` is the next available line item id
      available within invoice 12345.

    - `invoice status="foo" 12345` is a key associated with an empty
      value which exists iff invoice 12345 has status "foo".  "foo" is
      a JSON representation of the status field from that invoice.

    Protocol Buffers or CapnProto or FlatBuffers or Thrift
    or probably even marshal would of
    course be faster and more compact than JSON, but JSON on my laptop
    still runs at tens of megabytes per second, which is a lot faster
    than I’m generating this test data or, in fact, than LevelDB can
    store it.

    Using different native-Python codecs in place of JSON for tuple
    encoding produced these results:

    marshal: 24% shorter total run time (32% higher speed)
    cPickle: 12% shorter total run time (14% higher speed)

    This implies that JSON is in fact responsible for at least 24% of
    the program’s runtime at this point, and probably closer to 30%.

    """

    codec = json                # or marshal, or cPickle
    def __init__(self, filename):
        # XXX why is this data not getting Snappied?
        self.db = leveldb.LevelDB(filename)
        self.batch = leveldb.WriteBatch()

        # .Get on the LevelDB won’t return things that are in our
        # current write batch.  So, to maintain read-our-writes
        # consistency within a transaction. we have to maintain our
        # own cache with at least things we haven’t yet written to the
        # database.
        self.cache = {}

        # To avoid the cost of deserializing things, we maintain a
        # cache of recently fetched rows in deserialized form; this is
        # simply managed by emptying it when it gets full, which is
        # much simpler than doing a better approximation of LRU and
        # performs well enough.
        self.row_cache = {}

    def recreate(self):
        for i, (k, v) in enumerate(self.db.RangeIter()):
            self.batch.Delete(k)
            if 0 == (i + 1) % 2048:
                self.commit()

        self.commit()

        for k, v in self.db.RangeIter():
            assert "database" == "not empty", (k, v)

        self.put('cust id', '1')
        self.put('invoice id', '1')
        self.put('product id', '1')
        self.commit()

    def put(self, k, v):
        self.batch.Put(k, v)
        self.cache[k] = v

    def get(self, k):
        if k in self.cache:
            return self.cache[k]
        return self.db.Get(k)

    def commit(self):
        self.db.Write(self.batch)
        self.batch = leveldb.WriteBatch()
        self.cache.clear()

    def add_cust(self, name, company, streetaddress, city, state, zip_):
        return self.add_row('cust',
                            [name, company, streetaddress, city, state, zip_,])

    cust_fields = tuple('name company streetaddress city state zip'.split())
    def get_cust_by_id(self, cust_id, fields):
        db_tuple = self.get_row('cust', cust_id)
        row = dict(zip(self.cust_fields, db_tuple))
        return tuple(row[field] for field in fields)

    def add_product(self, product, price):
        return self.add_row('product', [product, price])

    def add_invoice(self, cust_id, due, status):
        due = calendar.timegm(time.strptime(due, '%Y-%m-%d'))
        self.get_cust_by_id(cust_id, ()) # to validate foreign key constraint
        invoice_id = self.add_row('invoice', [cust_id, due, status])
        self.put('invoice %d item id' % invoice_id, '1')
        self.put('invoice status=%s %d' % (json.dumps(status), invoice_id), '')
        return invoice_id

    def add_line_item(self, qty, unitprice, productid, invoice_id):
        # Validate foreign key constraint.  This seems to be fairly
        # slow; with it this took about 15' before I added caching,
        # and with it commented out it only took about 9', while
        # caching has cut that back down to about 9'45".
        self.get_row('product', productid)

        # This will fail if the invoice doesn’t exist because the
        # counter won’t be initialized, thus implicitly checking the
        # other foreign key constraint.
        self.add_row('invoice %d item' % invoice_id,
                     [qty, unitprice, productid])

    def get_invoice_by_id(self, invoice_id):
        cust_id, due, status = self.get_row('invoice', invoice_id)
        due = time.strftime('%Y-%m-%d', time.gmtime(due))
        (name, company, streetaddress, city, state, zip_
        ) = self.get_row('cust', cust_id)
        return name, company, streetaddress, city, state, zip_, status, due

    def get_line_items(self, invoice_id):
        return list(self._get_line_items(invoice_id))

    def _get_line_items(self, invoice_id):
        self.commit()
        base_key = 'invoice %d item ' % invoice_id
        bkl = len(base_key)
        for k, v in self.db.RangeIter(base_key):
            if not k.startswith(base_key) or not '0' <= k[bkl:bkl+1] <= '9':
                return
            qty, unitprice, productid = self.codec.loads(v)
            unitprice = decimal.Decimal('%.2f' % unitprice)
            name, _ = self.get_row('product', productid)
            yield name, unitprice, qty, unitprice*qty

    def add_row(self, table, values):
        row_id = self.allocate_id(table)
        self.put('%s %d' % (table, row_id), self.codec.dumps(values))
        return row_id

    def get_row(self, table, row_id):
        try:
            return self.row_cache[table, row_id]
        except KeyError:
            row = self.codec.loads(self.get('%s %d' % (table, row_id)))
            if len(self.row_cache) > 16384:
                self.row_cache.clear()
            self.row_cache[table, row_id] = row
            return row

    def allocate_id(self, counter):
        key = '%s id' % counter
        new_id = int(self.get(key))
        self.put(key, str(new_id + 1))
        return new_id


class SqliteInvoiceDb:
    """Should I make a SQLite3 version too?

    The DB-API makes it very similar to Postgres, but to connect() you
    supply a filename rather than some name-value pairs.  It accepts
    but does not validate foreign key constraints.  It does validate
    unique constraints on columns introduced with `primary key` and
    also with `create unique index`; it also supports `not null`
    constraints.  It accepts `serial` as a column type but does not
    populate the column.  It supports `drop table if exists`, but its
    `begin` and `commit` statements do not allow the optional `work`.

    Since I already have the code for calculating the ids rather than
    relying on a `serial` column in the SqlScriptInvoiceDb, I can feed
    this quickly in:

    $ ./exampledb.py -sql -n 64 64cust.sql
    $ time grep -v 'select setval' 64cust.sql | sqlite3 64cust.sqlite3

    real	0m14.926s
    user	0m12.888s
    sys	0m0.488s

    Extrapolating, that’s 477 seconds, or just under 8 minutes, for
    the whole 2048-customer shebang.  Which makes it seem like it
    would be super worthwhile.  Indeed:

    $ time ./exampledb.py -sql >(grep -v 'select setval' | sqlite3 example.sqlite3)
    ...
    real	8m7.950s

    The resulting file is 321 mebibytes.

    The only real incompatibility with the Postgres implementation
    above is the `returning id` stuff.

    """

if __name__ == '__main__':
    main(sys.argv)