#!/usr/bin/python # -*- coding: utf-8 -*- """Operations on finite functions for a principled APL. Based on , with inspiration from Pandas and of course SQL, Kdb+, and the stock market. We start with these basic ideas: * a variable is a function of circumstances, of some subset of the coordinates in a many-dimensional space whose coordinates are mostly unknown; * and some of these functions depend on some coordinates, while others depend on other coordinates; and some are only valid in certain parts of the space, while others are valid everywhere; * and furthermore that it makes sense to reduce a function across a dimension with a semigroup operation (such as +, *, and, or, min, or max) to get a new function that doesn’t depend on that dimension; XXX does semigroup make sense for nominal dimensions? * and for ordinal dimensions, we can additionally reduce with (XXX what kind of?) operations like first and last; * that a constant is merely a function that happens to depend on no coordinates at all; * that we can introduce new dimensions at will, using the ι function on the dimension to get a function that depends on it; * that, for dimensions of at least ordinal scale, we can not only reduce along the dimension but also calculate a scan (aka prefix sum, running sum, cumulative sum, sum table) for that function across that dimension with, in principle, any binary operation valid for the function’s values, but in practice generally only semigroup operations; * and, for functions whose value is of at least ordinal scale, we can compute the “grade-up” of that function along any dimension, producing a new dimension that is the ordering of that function along that dimension; * and, finally, that we can *index* a function by another function along some dimension, as long as the other function’s values are valid values of that dimension, to get the composition of the functions, which may be optimized by the “gather” SIMD operation, like the SSE3 `shuffle` instruction, as explained in . This gives us a relatively compact and powerful formalism similar to APL, but more principled. It can in principle handle functions that are defined at arbitrary subsets of the coordinate space, not just rectangular subsets of it. There are a few key benefits of writing your program in this way: 1. You can amortize interpretive and type-dispatch overhead to a great extent. This benefit is somewhat vitiated by needing to worry about whether you are accidentally computing large amounts of data you won’t use, but see below. 2. Not only don’t you have to write foreach loops explicitly, but in fact your code can remain unchanged even when the set of things you're looping over changes. Maybe today, ticksize depends only on the security, but tomorrow the ticksize of yen futures contracts changes, and now ticksize depends both on the security and the trading date, or rather some kind of trading epoch with initially only two values. All the functions you wrote previously that compute with ticksizes now depend on the trading epoch as well as the security, but this happens without you having to add a parameter to them. That is to say, since new dimensions can be introduced at any later time, what looks like a point now can turn out to be a hyperplane later. 3. Yeah, the code is short. But efficiency and maintainability are more important. Don’t get seduced by brevity. It’s dangerous. 4. You can change the evaluation strategy for a given dimension. For example, it might turn out that you have more data than will fit in RAM. Maybe you can reduce your working set size by making one or more of the dimensions temporal, so that at any given time, only data that doesn’t depend on those dimensions, or data for the current value of those dimensions, is in RAM. Or maybe, although the data fits in RAM, you’re thrashing your cache, so you can induce the system to “tile” into 8×8 blocks of coordinates along two of the dimensions, improving locality of reference. All this without changing the application code describing the function to be computed. All of this is pretty speculative until there’s a working system, though. Current status of that: you can use arbitrary operations on constants and iotas, and reduce with arbitrary operations over any dimension, but no composition yet, and you can select the points where a boolean holds. And of course this is all CPython so you lose the potential efficiency advantage, and furthermore it doesn’t take advantage of regularity that may occur in the dimensions of a function, for example, if it fills a whole rectangular volume with no holes. """ import operator class Function: def __init__(self, dimensions, abscissas, ordinate): self.dimensions = tuple(dimensions) self.abscissas = [tuple(a) for a in abscissas] self.ordinate = ordinate def abscissas_along_dimension(self, dimension): index = self.dimensions.index(dimension) # otherwise, the universe return [abscissa[index] for abscissa in self.abscissas] def as_table_cells(self): yield self.dimensions + ('',) for d, o in zip(self.abscissas, self.ordinate): yield d + (o,) def __str__(self): rows = [map(str, row) for row in self.as_table_cells()] if not rows: return '(function defined nowhere)' widths = [max(len(row[i]) for row in rows) for i in range(len(rows[0]))] return ''.join(' | '.join(cell.rjust(width) for cell, width in zip(row, widths)) + '\n' for row in rows) def get_at(self, dimensions, values): # Figure out where in the query to find each of the columns of # our abscissa. Also ensures that this query is for at most # one point. (Otherwise use __getitem__!) query_indexes = [dimensions.index(d) for d in self.dimensions] # Now rearrange the query to conform to our abscissa. values = tuple([values[i] for i in query_indexes]) for i in range(len(self.abscissas)): assert type(self.abscissas[i]) is type(values) if self.abscissas[i] == values: return self.ordinate[i] def sum(self, dimension): return self.reduce(dimension, operator.add) def product(self, dimension): return self.reduce(dimension, operator.mul) def max(self, dimension): return self.reduce(dimension, max) def min(self, dimension): return self.reduce(dimension, min) def all(self, dimension): return self.reduce(dimension, operator.__and__) def any(self, dimension): return self.reduce(dimension, operator.__or__) def reduce(self, dimension, op): if dimension not in self.dimensions: return self idx = self.dimensions.index(dimension) columns = transpose(self.abscissas) da, db = self.dimensions[:idx], self.dimensions[idx+1:] ca, cb = columns[:idx], columns[idx+1:] r_abscissas = uniq(column_cross_product(ca + cb)) ordinate_dict = {} for a, o in zip(self.abscissas, self.ordinate): na = a[:idx] + a[idx+1:] # new abscissa if na in ordinate_dict: ordinate_dict[na] = op(ordinate_dict[na], o) else: ordinate_dict[na] = o return Function(da + db, r_abscissas, [ordinate_dict[a] for a in r_abscissas]) def where(self, boolean): boolean = as_function(boolean) # XXX this is not really the right thing to do. Instead we # should add the missing dimensions into the result! assert not any(d not in self.dimensions for d in boolean.dimensions), ( "Reduce your boolean from dimensions %s to dimensions %s first" % (boolean.dimensions, self.dimensions)) ra = [] ro = [] for a, o in zip(self.abscissas, self.ordinate): b = boolean.get_at(self.dimensions, a) if b is not None and b: ra.append(a) ro.append(o) return Function(self.dimensions, ra, ro) def __getitem__(self, other): other = as_function(other) XXX def make_broadcasting_op(scalar_op): def broadcasting_op(self, other): other = as_function(other) dimensions, abscissas = dimension_union(self.dimensions, other.dimensions, self.abscissas, other.abscissas) r_abscissas = [] ordinate = [] for a in abscissas: av = self.get_at(dimensions, a) bv = other.get_at(dimensions, a) if av is not None and bv is not None: r_abscissas.append(a) ordinate.append(scalar_op(av, bv)) return Function(dimensions, r_abscissas, ordinate) return broadcasting_op def make_unary_broadcasting_op(scalar_op): return lambda self: Function(self.dimensions, self.abscissas, [scalar_op(item) for item in self.ordinate]) def _add_broadcasting_ops(): for name in ['__add__', '__sub__', '__mul__', '__div__', '__and__', '__eq__', '__ge__', '__gt__', '__le__', '__lshift__', '__lt__', '__mod__', '__ne__', '__or__', '__pow__', '__rshift__', '__truediv__', '__xor__']: setattr(Function, name, make_broadcasting_op(getattr(operator, name))) for name, op in [('__radd__', operator.add), ('__rsub__', lambda a, b: b - a), ('__rmul__', operator.mul), ('__rdiv__', lambda a, b: b / a), ]: setattr(Function, name, make_broadcasting_op(op)) for name in ['__abs__', '__pos__', '__invert__', '__neg__']: setattr(Function, name, make_unary_broadcasting_op(getattr(operator, name))) _add_broadcasting_ops() def as_function(obj): if isinstance(obj, Function): return obj # By default return a constant function. return Function([], [()], [obj]) def dimension_union(d1, d2, a1, a2): d3 = list(d1[:]) for d in d2: if d not in d3: d3.append(d) a1_columns = transpose(a1) a2_columns = transpose(a2) r_columns_1 = [intersect(uniq(a1_column), uniq(a2_columns[d2.index(d)])) if d in d2 else uniq(a1_column) for d, a1_column in zip(d1, a1_columns)] r_columns_2 = [uniq(a2_columns[d2.index(d)]) for d in d3[len(d1):]] return d3, column_cross_product(r_columns_1 + r_columns_2) def column_cross_product(columns): return list(_column_cross_product(columns)) def _column_cross_product(columns): if not columns: yield () return for item in columns[0]: for cross in _column_cross_product(columns[1:]): yield (item,) + cross def transpose(rows): return zip(*rows) def uniq(column): return list(_uniq(column)) def _uniq(column): seen = set() for item in column: if item not in seen: yield item seen.add(item) def intersect(a, b): return list(_intersect(a, b)) def _intersect(a, b): s = set(b) for item in a: if item in s: yield item class Dimension: def __init__(self, values, name=None): self.values = values if name is None: name = generate_dimension_name() self.name = name def __str__(self): return self.name def __repr__(self): vr = ' [%s..%s]' % (self.values[0], self.values[-1]) if self.values else '' return '' % (self.name, vr, len(self.values)) def iota(self): return Function([self], [[v] for v in self.values], self.values) dimension_digits = 'foo bar baz to but too mad'.split() dimension_counter = 0 def generate_dimension_name(): global dimension_counter n = dimension_counter dimension_counter += 1 rv = [] while True: n, m = divmod(n, len(dimension_digits)) rv.append(dimension_digits[m]) if not n: break return ''.join(rv) if __name__ == '__main__': d = Dimension(range(5), 'd') e = Dimension(range(6), 'e') k = as_function(9) i = d.iota() j = e.iota() print i print i * i print k print i * k print k * i print i * j print i + 1 print 2 + j print i / (i + 1.0) print j & j + 1 print 1 - i print i * 2 > i print abs(j - 2) print ~i print (i * j).sum(d) print (i**2).sum(d) print (i*i).sum(d) print (i+1).product(d) # Factorial! f = Dimension(range(12), 'f') g = Dimension(range(12), 'g') fi = f.iota() gi = g.iota() print ((fi < gi) * fi + 1).product(fi) # Many factorials! x = fi * gi print x.where(fi > gi) # A better way to do factorial that doesn’t work yet print (fi.where(fi < gi) + 1).product(fi)