#!/usr/bin/python # -*- coding: utf-8 -*- """An example SQL database to show what those are good for. This should be about 500 megabytes of line items, which should be about 64 bytes, so that’s about 8 mebi-items. If we somewhat unrealistically distribute these equally 32 per invoice, which are equally distributed 128 per customer, we need 2048 customers, giving 262144 invoices. If we want to generate unique names for our customers, we need at least 11 bits of entropy, and maybe as much as 22 if we want to keep collisions minimal. I think 14 bits should be plenty, which we can achieve with 128 (uniformly random) first names and 128 last names. Similarly we need 22 bits of randomness per address if we want them to be probably collision-free, but that’s easier to achieve, among other things because addresses include street numbers and ZIP codes. Somewhat disappointingly, this takes 71 minutes to load the eight million line items into Postgres. Just generating a SQL script with the data in it only takes 4'40", so the Python logic is not the bottleneck here, though psql running with that SQL script for input finishes in under 51' instead of 71', so the psycopg2 bindings may be somewhat at fault here. The LevelDB implementation is still profoundly suboptimal for ad-hoc queries, but it currently finishes inserting all the data in 9'45". (I succumbed to the temptation to use LevelDB string keys as a hierarchical database, and now it’s totally impractical to do any queries that do a sequential scan on invoice records. Fixing this will involve reorganizing the data structure to be relational.) This is still only inserting about 10,000 key-value pairs per second, while leveldb-62m.cc on the same machine is able to insert about 300,000 (smaller and less interesting) key-value pairs per second. The LevelDB implementation not only takes less time than Postgres but also less disk space: 240MB rather than Postgres’s 600+. As far as I can tell, this is without Snappy (although I don’t know why Snappy is not activated). """ from __future__ import division import calendar try: import cPickle as pickle except ImportError: import pickle import decimal import json import math import marshal import random import string import sys import time import psycopg2 try: import leveldb except ImportError: pass # apparently leveldb was lost in Python3 try: unicode except NameError: unicode = str def main(argv): fast = False sql = False leveldb = False n_customers = 2048 while True: if argv[1] == '-f': fast = True elif argv[1] == '-sql': sql = True elif argv[1] == '-leveldb': leveldb = True elif argv[1] == '-n': n_customers = int(argv[2]) argv[2:3] = [] elif argv[1].startswith('-'): raise InvalidArgument(argv[1]) else: break argv[1:2] = [] if sql: db = SqlScriptInvoiceDb(open(argv[1], 'wb')) elif leveldb: db = LevelDbInvoiceDb(argv[1]) else: db = PostgresInvoiceDb('dbname=' + argv[1]) if not fast: db.recreate() cust_ids = generate_customers(db, n_customers) products = generate_products(db, 1024) for jj, cust_id in enumerate(cust_ids): (name,) = db.get_cust_by_id(cust_id, ('name',)) print("generating invoices for %s (%d)" % (name, cust_id)) for ii in range(128): invoice_id = generate_invoice(db, cust_id) for jj in range(32): generate_line_item(db, invoice_id, random.choice(products)) if jj % 8 == 7: db.commit() db.commit() if not sql: display_invoice(db, 36) display_invoice(db, 937) display_invoice(db, 338) class InvalidArgument(Exception): pass def generate_customers(db, n): names = set() cust_ids = [] for ii in range(n): while True: name = generate_name() if name not in names: names.add(name) break cust_ids.append(db.add_cust(name=name, company=generate_company(), streetaddress=generate_street_address(), city=random.choice(cities), state=random.choice(states), zip_='%05d' % random.randrange(1e5), )) # Keep the transactions from getting too big. if ii % 128 == 0: db.commit() db.commit() return cust_ids # Note that this simple approach suffers from the Mohammed Wang # problem. def generate_name(): return ' '.join([random.choice(forenames), random.choice(surnames)]) # This simple approach starts generating collisions after a few dozen, # but that should be okay for this example. def generate_company(): company_type = random.choice(['Enterprises', 'Group', 'Inc.', 'LLC', '']) surname = random.choice(surnames) base_name = random.choice([surname, ''.join(random.choice(string.ascii_uppercase) for i in range(random.randrange(2, 5))), '–'.join([surname, random.choice(surnames)]), surname + ' & Associates', surname + ' & Company', '%s, %s and %s' % (surname, random.choice(surnames), random.choice(surnames)), ]) base = random.choice([' '.join([base_name, random.choice(['Construction', 'Transportation', 'Couriers', 'Petroleum', 'Software', 'Consultants', 'Holdings', 'Engineering', 'Insurance']), company_type]), ' '.join([base_name, company_type]), ]) return base # This starts generating collisions after only ten or twenty items, # but that’s okay. has some # examples of what this kind of thing looks like. def generate_street_address(): # Most US cities have about 20 numbered streets. numbered = 1 + int(random.expovariate(1/19)) # Most streets have addresses going up to about 5000. length = 100 + int(random.expovariate(1/4900)) street_type = random.choice('St. Rd. Ave. Dr. Place Circle Ct.'.split()) nstreet = (ordinal(1 + random.randrange(numbered)).title() + ' ' + random.choice(['St.', 'Ave.'])) common = random.choice(common_street_names) street_category = random.random() street_name = (nstreet if street_category < .3 else common + ' ' + street_type if street_category < .7 else random.choice(surnames) + ' ' + street_type) if random.random() < .2: direction = random.choice('North South East West'.split()) street_name = direction + ' ' + street_name number = random.randrange(1, length) return '%s %s' % (number, street_name) def generate_products(db, n): products = [] for ii in range(n): product = generate_product() # Consumer products like those listed here are usually between # US$5 and US$200, of which $31 is the geometric mean. # “Usually” might mean 2σ on a standard normal deviation. raw_price = random.lognormvariate(math.log(31), math.log(31/5)/2) price = round_to_sigfigs(3, raw_price) product_id = db.add_product(product, price) products.append((product_id, product, price)) # Keep the transactions from getting too big. if ii % 128 == 0: db.commit() db.commit() # just in case 128 doesn’t divide n return products def round_to_sigfigs(n, x): return round(x, int(n-math.floor(math.log10(x))-1)) def ok(a, b): assert a == b, (a, b) ok(round_to_sigfigs(2, 34.345), 34.0) ok(round_to_sigfigs(1, 34.345), 30.0) ok(round_to_sigfigs(3, 34.345), 34.3) ok(round_to_sigfigs(4, 34.344), 34.34) ok(round_to_sigfigs(4, 134.344), 134.30) def generate_invoice(db, cust_id): ago = random.randrange(365 * 24 * 60 * 60) status = ('unsent' if ago < 24 * 60 * 60 * 3 else 'unpaid' if random.random() < .001 else 'paid') due_epoch_seconds = time.time() - ago + 30 * 24 * 60 * 60 due = time.strftime('%Y-%m-%d', time.gmtime(due_epoch_seconds)) return db.add_invoice(cust_id, due, status) def generate_line_item(db, invoice_id, product_row): productid, name, unitprice = product_row qty = int(1 + math.floor(random.expovariate(1))) db.add_line_item(qty, unitprice, productid, invoice_id) header_template = """ INVOICE %(status)s Due %(due)s %(name)s %(company)s %(streetaddress)s %(city)s, %(state)s %(zip)s """.lstrip() def display_invoice(db, invoice_id): (name, company, streetaddress, city, state, zip_, status, due ) = db.get_invoice_by_id(invoice_id) print((header_template % dict(name=name, company=company, streetaddress=streetaddress, city=city, state=state, zip=zip_, status='PAID' if status == 'paid' else '', due=due, ))) names, prices, qties, totals = zip(*db.get_line_items(invoice_id)) sum_totals = sum(totals) price_width = 9 qty_width = 5 widths = (2+len(max(names, key=len)), price_width, qty_width, price_width) prices = [('$' + str(price)).rjust(price_width-1) for price in prices] totals = [('$' + str(price)).rjust(price_width-1) for price in totals] qties = [str(qty).rjust(qty_width-1) for qty in qties] sum_totals = ('$' + str(sum_totals)).rjust(price_width-1) print_cols(widths, ('Product Name', 'Unit price', 'Quantity', 'Price')) print_cols(widths, ('', '', '', ''), pad='-', sep='+') for ii in range(len(names)): print_cols(widths, (' ' + names[ii], prices[ii], qties[ii], totals[ii])) print_cols(widths, ('', '', '', ''), pad='-', sep='+') print_cols(widths, ('%d items' % len(names), '', 'Total:', sum_totals), sep=' ') def print_cols(widths, values, pad=' ', sep='|'): sfields = [] for ii, (width, val) in enumerate(zip(widths, values)): if not isinstance(val, unicode): val = val.decode('utf-8') sfields.append(val.ljust(width, pad)[:width]) print(sep.join(sfields).encode('utf-8')) def ordinal(n): """Convert an int to an English ordinal. cf. """ assert n > 0 if n < 20: return '''first second third fourth fifth sixth seventh eighth ninth tenth eleventh twelfth thirteenth fourteenth fifteenth sixteenth seventeenth eighteenth nineteenth'''.split()[n-1] elif n < 100: tens = n//10 tens_base = 'twen thir for fif six seven eigh nine'.split()[tens-2] if n % 10 == 0: return tens_base + 'tieth' else: return tens_base + 'ty-' + ordinal(n % 10) else: suffix = ('st' if n % 10 == 1 else 'nd' if n % 10 == 2 else 'rd' if n % 10 == 3 else 'th') return str(n) + suffix ok(ordinal(1), 'first') ok(ordinal(2), 'second') ok(ordinal(9), 'ninth') ok(ordinal(11), 'eleventh') ok(ordinal(12), 'twelfth') ok(ordinal(18), 'eighteenth') ok(ordinal(20), 'twentieth') ok(ordinal(21), 'twenty-first') ok(ordinal(28), 'twenty-eighth') ok(ordinal(37), 'thirty-seventh') ok(ordinal(40), 'fortieth') ok(ordinal(56), 'fifty-sixth') ok(ordinal(99), 'ninety-ninth') ok(ordinal(100), '100th') ok(ordinal(101), '101st') ok(ordinal(166), '166th') ok(ordinal(172), '172nd') ok(ordinal(183), '183rd') # 128 surnames taken from # surnames = """ Adams Allen Almonte Anderson Araya Bailey Baker Barnes Bell Bennett Brooks Brown Butler Calderón Campbell Carter Castillo Clark Collins Cook Cooper Cox Cruz Davis De la Cruz De los Santos Díaz Edwards Evans Féliz Fernández Fisher Flores Foster Gagnon Garcia Gómez González Gray Green Gutiérrez Guzmán Hall Harris Hernández Herrera Hill Howard Hughes James Jenkins Johnson Jones Kelly King Lam Lee Lewis Li Long López Madrigal Martin Martinez Mejía Miller Mitchell Moore Mora Morales Morgan Morris Murphy Myers Nelson Nguyen Núñez Ortiz Parker Pérez Perry Peterson Phillips Polanco Powell Price Ramírez Reed Reyes Richardson Rivera Roberts Robinson Rodríguez Rogers Rojas Rosario Ross Roy Russell Sánchez Sanders Santana Santos Scott Segura Smith Stewart Sullivan Taylor Thomas Thompson Torres Tremblay Turner Valverde Vargas Vásquez Villalobos Walker Ward Watson White Williams Wilson Wood Wright Young """.strip().split('\n') assert len(surnames) == 128 # Names taken from # forenames = """ Aaron Adam Alan Albert Amanda Amber Amy Andrea Andrew Ann Anthony Arthur Austin Benjamin Beverly Billy Bobby Brian Brittany Bryan Carl Carol Carolyn Catherine Charles Christian Christina Cynthia Daniel Danielle Deborah Debra Dennis Diana Diane Donna Dorothy Douglas Dylan Edward Eric Ethan Eugene Evelyn Frances Gabriel George Gloria Grace Gregory Hannah Harold Heather Helen Howard Jack Jacob Jacqueline Janet Janice Jason Jean Jeffrey Jennifer Jeremy Jerry Joan Johnny Joshua Joyce Juan Judith Julie Karen Katherine Kathy Kelly Kenneth Kevin Kyle Larry Lauren Lawrence Linda Lisa Louis Maria Marie Marilyn Mark Martha Mary Matthew Melissa Michelle Natalie Nathan Nicholas Nicole Noah Olivia Pamela Patricia Patrick Paul Rachel Ralph Rebecca Rose Roy Russell Ruth Ryan Samantha Sarah Sean Shirley Stephen Steven Susan Teresa Terry Timothy Tyler Victoria Walter Wayne Zachary """.strip().split('\n') assert len(forenames) == 128 # Common US city names taken from # cities = """ Washington Springfield Franklin Greenville Bristol Clinton Fairview Salem Madison Georgetown Arlington Ashland Dover Oxford Jackson Burlington Manchester Milton Newport Auburn Centerville Clayton Dayton Lexington Milford Mount Vernon Oakland Winchester Cleveland Hudson Kingston Riverside """.strip().split('\n') # US state abbreviations from # states = """AL AK AZ AR CA CO CT DE FL GA HI ID IL IN IA KS LA ME MD MI MN MS MO MT NE NV NH NJ NM NY NC ND OH OK OR RI SC SD TN TX UT VT WA WV WI WY KY MA PA VA """.strip().split() assert len(states) == 50 # Some names crudely retyped from # # and # . common_street_names = """Main Park Oak Maple Pine Washington Cedar Elm Walnut Lake Sunset Lincoln Jackson Church River Willow Jefferson Center Lakeview Ridge Hickory Adams Cherry Highland Johnson Dogwood Chestnut Spruce Wilson Meadow Forest Hill Madison Mill Railroad """.strip().split() def generate_product(): if random.random() < .7: garment = random.choice(garments) color = random.choice(colors) fiber = random.choice(fibers) size = random.choice(sizes) adj = random.choice(adjectives) return ('%s %s %s %s, %s' % (color.capitalize(), fiber, adj, garment, size)) elif random.random() < .5: return ' '.join([random.choice(adjectives), random.choice(colors), random.choice(hard_materials), random.choice(toys)]).capitalize() else: return ' '.join([random.choice(colors), random.choice(adjectives), random.choice(hard_materials), random.choice(appliances)]).capitalize() colors = "white black red blue green yellow gray beige gold blue".split() fibers = "cotton polyester spandex wool".split() garments = "T-shirt boxers briefs pants blazer dress sweater".split() adjectives = """ultrasonic intimate surgical dynamic utility biological personal daily electronic""".split() sizes = """ child’s size ages 2–5 ages 3–10 size S size M size L size XL """.strip().split("\n") hard_materials = "plastic aluminum ceramic steel glass graphite".split() toys = "car helicopter doll bicycle trike truck".split() appliances = """ kettle microwave toaster oven blender toaster juicer skillet refrigerator rice cooker pressure cooker """.strip().split("\n") class PostgresInvoiceDb: def __init__(self, dsn): self.db = psycopg2.connect(dsn) self.cur = self.db.cursor() ddl = b""" drop table if exists lineitem; drop table if exists invoice; drop table if exists product; drop table if exists cust; create table cust ( id serial primary key , name varchar not null , company varchar , streetaddress varchar , city varchar , state varchar , zip varchar ); create table invoice ( id serial primary key , custid int references cust (id) not null , due timestamp , status varchar check (status in ('paid', 'unpaid', 'unsent')) ); create table product ( id serial primary key , name varchar not null , unitprice decimal(8, 2) not null ); create table lineitem ( qty int not null , unitprice decimal(8, 2) not null , productid int references product (id) not null , invoiceid int references invoice (id) not null ); -- Without this index, finding the line items for an invoice takes a -- second or two. create index lineitem_invoiceid on lineitem (invoiceid); -- This index speeds up finding unpaid invoices by about a factor of -- 5. create index invoice_status on invoice (status); -- It would probably make sense to have an index on invoice by custid, -- too, but so far the gains are fairly minimal. """ def recreate(self): self.cur.execute(self.ddl) self.commit() # The default is to rollback! def commit(self): self.db.commit() def add_cust(self, name, company, streetaddress, city, state, zip_): self.cur.execute("""insert into cust (name, company, streetaddress, city, state, zip) values (%s, %s, %s, %s, %s, %s) returning id""", (name, company, streetaddress, city, state, zip_)) ((cust_id,),) = self.cur.fetchall() return cust_id def get_cust_by_id(self, cust_id, fields): query = "select %s from cust where id = %%s" % ', '.join(fields) self.cur.execute(query, (cust_id,)) (row,) = self.cur.fetchall() return row def add_product(self, product, price): self.cur.execute("""insert into product (name, unitprice) values (%s, %s) returning id """, (product, price)) ((product_id,),) = self.cur.fetchall() return product_id def add_invoice(self, cust_id, due, status): self.cur.execute("""insert into invoice (custid, due, status) values (%s, %s, %s) returning id""", (cust_id, due, status)) ((invoice_id,),) = self.cur.fetchall() return invoice_id def add_line_item(self, qty, unitprice, productid, invoice_id): self.cur.execute("""insert into lineitem (qty, unitprice, productid, invoiceid) values (%s, %s, %s, %s)""", (qty, unitprice, productid, invoice_id)) def get_invoice_by_id(self, invoice_id): self.cur.execute("""select c.name, c.company, c.streetaddress, c.city, c.state, c.zip, invoice.status, invoice.due from cust as c join invoice on invoice.custid = c.id where invoice.id = %s """, (invoice_id,)) (row,) = self.cur.fetchall() return row def get_line_items(self, invoice_id): self.cur.execute("""select product.name, lineitem.unitprice, qty, lineitem.unitprice*qty from lineitem join product on lineitem.productid = product.id where invoiceid = %s """, (invoice_id,)) return self.cur.fetchall() def sql_escape(value): if isinstance(value, unicode): value = value.encode('utf-8') if isinstance(value, bytes): return b"'%s'" % value.replace(b"'", b"''") else: return str(value).encode('utf-8') class SqlScriptInvoiceDb: def __init__(self, output_file_object): self.out = output_file_object self.max_id = {} self.last_max_id = {} self.cust = {} self.out.write(b'begin;\n') ddl = PostgresInvoiceDb.ddl def recreate(self): self.out.write(self.ddl) self.commit() def commit(self): self.flush_max_id_changes() self.out.write(b'commit;\n') self.out.write(b'begin;\n') def flush_max_id_changes(self): for table, new_max in self.max_id.items(): if new_max == self.last_max_id.get(table, 1): continue self.out.write(b'select setval(%s, %d);\n' % (sql_escape(b'%s_id_seq' % table.encode('utf-8')), new_max)) self.last_max_id[table] = new_max def insert_id(self, table, fields, values): if table not in self.max_id: self.max_id[table] = 0 self.max_id[table] += 1 row_id = self.max_id[table] self.out.write(self.insert_statement( table, ('id',) + fields, (row_id,) + values)) return row_id def insert_statement(self, table, fields, values): return b"insert into %s (%s) values (%s);\n" % ( table.encode('utf-8'), b', '.join(field.encode('utf-8') for field in fields), b', '.join(map(sql_escape, values)), ) cust_fields = tuple('name company streetaddress city state zip'.split()) def add_cust(self, name, company, streetaddress, city, state, zip_): row = (name, company, streetaddress, city, state, zip_) row_id = self.insert_id('cust', self.cust_fields, row) self.cust[row_id] = row return row_id def get_cust_by_id(self, cust_id, fields): row = dict(zip(self.cust_fields, self.cust[cust_id])) return tuple(row[field] for field in fields) def add_product(self, product, price): return self.insert_id('product', ('name', 'unitprice'), (product, price)) def add_invoice(self, cust_id, due, status): return self.insert_id('invoice', ('custid', 'due', 'status'), (cust_id, due, status)) def add_line_item(self, qty, unitprice, productid, invoice_id): self.out.write(self.insert_statement( 'lineitem', 'qty unitprice productid invoiceid'.split(), (qty, unitprice, productid, invoice_id))) class LevelDbInvoiceDb: """Store the invoice data in LevelDB with the same indexing as in Postgres. But now we are responsible for updating the indexing, and for that matter encoding the data and separating the tables, ourselves. Here’s the key schema we’re using. - `cust 12345` is customer id 12345. The value is a JSON encoding of the list [name, company, streetaddress, city, state, zip], where zip is a number. - `cust id` is the next available customer id, as a decimal string. - `invoice 12345` is invoice id 12345. The value is a JSON encoding of the list [custid, due, status]. `custid` is a number; `due` is a number of seconds since 1969 UTC (the Unix epoch); `status` is a string. - `invoice id` is the next available invoice id, as a decimal string. - `product 12345` is product id 12345. The value is a JSON encoding of the list [name, unitprice]. `unitprice` is a number. - `product id` is the next available product id, as a decimal string. - `invoice 12345 item 3` is line item 3 of invoice 12345. The value is a JSON encoding of the list [qty, unitprice, productid], which are all numbers. The disadvantage of this key schema is that it makes it hard to fetch all the line items, for example to find all the line items for a given product; the invoices will be mixed in with them and need to be filtered out. Much worse, though, it means you can’t fetch all the invoices without fetching all the line items. This is totally fatal because it means you need an index on invoice for any kind of fast query, like by customer id. So this needs to change. - `invoice 12345 item id` is the next available line item id available within invoice 12345. - `invoice status="foo" 12345` is a key associated with an empty value which exists iff invoice 12345 has status "foo". "foo" is a JSON representation of the status field from that invoice. Protocol Buffers or CapnProto or FlatBuffers or Thrift or probably even marshal would of course be faster and more compact than JSON, but JSON on my laptop still runs at tens of megabytes per second, which is a lot faster than I’m generating this test data or, in fact, than LevelDB can store it. Using different native-Python codecs in place of JSON for tuple encoding produced these results: marshal: 24% shorter total run time (32% higher speed) cPickle: 12% shorter total run time (14% higher speed) This implies that JSON is in fact responsible for at least 24% of the program’s runtime at this point, and probably closer to 30%. """ codec = json # or marshal, or cPickle def __init__(self, filename): # XXX why is this data not getting Snappied? self.db = leveldb.LevelDB(filename) self.batch = leveldb.WriteBatch() # .Get on the LevelDB won’t return things that are in our # current write batch. So, to maintain read-our-writes # consistency within a transaction. we have to maintain our # own cache with at least things we haven’t yet written to the # database. self.cache = {} # To avoid the cost of deserializing things, we maintain a # cache of recently fetched rows in deserialized form; this is # simply managed by emptying it when it gets full, which is # much simpler than doing a better approximation of LRU and # performs well enough. self.row_cache = {} def recreate(self): for i, (k, v) in enumerate(self.db.RangeIter()): self.batch.Delete(k) if 0 == (i + 1) % 2048: self.commit() self.commit() for k, v in self.db.RangeIter(): assert "database" == "not empty", (k, v) self.put('cust id', '1') self.put('invoice id', '1') self.put('product id', '1') self.commit() def put(self, k, v): self.batch.Put(k, v) self.cache[k] = v def get(self, k): if k in self.cache: return self.cache[k] return self.db.Get(k) def commit(self): self.db.Write(self.batch) self.batch = leveldb.WriteBatch() self.cache.clear() def add_cust(self, name, company, streetaddress, city, state, zip_): return self.add_row('cust', [name, company, streetaddress, city, state, zip_,]) cust_fields = tuple('name company streetaddress city state zip'.split()) def get_cust_by_id(self, cust_id, fields): db_tuple = self.get_row('cust', cust_id) row = dict(zip(self.cust_fields, db_tuple)) return tuple(row[field] for field in fields) def add_product(self, product, price): return self.add_row('product', [product, price]) def add_invoice(self, cust_id, due, status): due = calendar.timegm(time.strptime(due, '%Y-%m-%d')) self.get_cust_by_id(cust_id, ()) # to validate foreign key constraint invoice_id = self.add_row('invoice', [cust_id, due, status]) self.put('invoice %d item id' % invoice_id, '1') self.put('invoice status=%s %d' % (json.dumps(status), invoice_id), '') return invoice_id def add_line_item(self, qty, unitprice, productid, invoice_id): # Validate foreign key constraint. This seems to be fairly # slow; with it this took about 15' before I added caching, # and with it commented out it only took about 9', while # caching has cut that back down to about 9'45". self.get_row('product', productid) # This will fail if the invoice doesn’t exist because the # counter won’t be initialized, thus implicitly checking the # other foreign key constraint. self.add_row('invoice %d item' % invoice_id, [qty, unitprice, productid]) def get_invoice_by_id(self, invoice_id): cust_id, due, status = self.get_row('invoice', invoice_id) due = time.strftime('%Y-%m-%d', time.gmtime(due)) (name, company, streetaddress, city, state, zip_ ) = self.get_row('cust', cust_id) return name, company, streetaddress, city, state, zip_, status, due def get_line_items(self, invoice_id): return list(self._get_line_items(invoice_id)) def _get_line_items(self, invoice_id): self.commit() base_key = 'invoice %d item ' % invoice_id bkl = len(base_key) for k, v in self.db.RangeIter(base_key): if not k.startswith(base_key) or not '0' <= k[bkl:bkl+1] <= '9': return qty, unitprice, productid = self.codec.loads(v) unitprice = decimal.Decimal('%.2f' % unitprice) name, _ = self.get_row('product', productid) yield name, unitprice, qty, unitprice*qty def add_row(self, table, values): row_id = self.allocate_id(table) self.put('%s %d' % (table, row_id), self.codec.dumps(values)) return row_id def get_row(self, table, row_id): try: return self.row_cache[table, row_id] except KeyError: row = self.codec.loads(self.get('%s %d' % (table, row_id))) if len(self.row_cache) > 16384: self.row_cache.clear() self.row_cache[table, row_id] = row return row def allocate_id(self, counter): key = '%s id' % counter new_id = int(self.get(key)) self.put(key, str(new_id + 1)) return new_id class SqliteInvoiceDb: """Should I make a SQLite3 version too? The DB-API makes it very similar to Postgres, but to connect() you supply a filename rather than some name-value pairs. It accepts but does not validate foreign key constraints. It does validate unique constraints on columns introduced with `primary key` and also with `create unique index`; it also supports `not null` constraints. It accepts `serial` as a column type but does not populate the column. It supports `drop table if exists`, but its `begin` and `commit` statements do not allow the optional `work`. Since I already have the code for calculating the ids rather than relying on a `serial` column in the SqlScriptInvoiceDb, I can feed this quickly in: $ ./exampledb.py -sql -n 64 64cust.sql $ time grep -v 'select setval' 64cust.sql | sqlite3 64cust.sqlite3 real 0m14.926s user 0m12.888s sys 0m0.488s Extrapolating, that’s 477 seconds, or just under 8 minutes, for the whole 2048-customer shebang. Which makes it seem like it would be super worthwhile. Indeed: $ time ./exampledb.py -sql >(grep -v 'select setval' | sqlite3 example.sqlite3) ... real 8m7.950s The resulting file is 321 mebibytes. The only real incompatibility with the Postgres implementation above is the `returning id` stuff. """ if __name__ == '__main__': main(sys.argv)