#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Operations on finite functions for a principled APL.
Based on , with
inspiration from Pandas and of course SQL, Kdb+, and the stock market.
We start with these basic ideas:
* a variable is a function of circumstances,
of some subset of the coordinates in a many-dimensional space whose
coordinates are mostly unknown;
* and some of these functions depend on
some coordinates, while others depend on other coordinates; and some
are only valid in certain parts of the space, while others are valid
everywhere;
* and furthermore that it makes sense to reduce a function
across a dimension with a semigroup operation (such as +, *, and, or,
min, or max) to get a new function that doesn’t depend on that
dimension; XXX does semigroup make sense for nominal dimensions?
* and for ordinal dimensions, we can additionally reduce with
(XXX what kind of?) operations like first and last;
* that a constant is merely a function that happens to depend
on no coordinates at all;
* that we can introduce new dimensions at
will, using the ι function on the dimension to get a function that
depends on it;
* that, for dimensions of at least ordinal scale, we can
not only reduce along the dimension but also calculate a scan
(aka prefix sum, running sum, cumulative sum, sum table)
for that function across that
dimension with, in principle, any binary operation valid for the
function’s values, but in practice generally only semigroup
operations;
* and, for functions whose value is of at least ordinal scale, we can
compute the “grade-up” of that function along any dimension, producing a
new dimension that is the ordering of that function along that dimension;
* and, finally, that we can *index* a function by another
function along some dimension,
as long as the other function’s values are valid values of that dimension,
to get the composition of the functions,
which may be optimized by the “gather” SIMD operation,
like the SSE3 `shuffle` instruction,
as explained in
.
This gives us a relatively compact and powerful formalism similar to
APL, but more principled.
It can in principle handle functions that
are defined at arbitrary subsets of the coordinate space,
not just rectangular subsets of it.
There are a few key benefits of writing your program in this way:
1. You can amortize interpretive and type-dispatch
overhead to a great extent. This
benefit is somewhat vitiated by needing to worry about whether you are
accidentally computing large amounts of data you won’t use, but see below.
2. Not only don’t you have to write foreach loops explicitly, but in
fact your code can remain unchanged even when the set of things you're
looping over changes. Maybe today, ticksize depends only on the
security, but tomorrow the ticksize of yen futures contracts changes,
and now ticksize depends both on the security and the trading date, or
rather some kind of trading epoch with initially only two values. All
the functions you wrote previously that compute with ticksizes now
depend on the trading epoch as well as the security, but this happens
without you having to add a parameter to them. That is to say, since
new dimensions can be introduced at any later time, what looks like a
point now can turn out to be a hyperplane later.
3. Yeah, the code is short. But efficiency and maintainability are
more important. Don’t get seduced by brevity. It’s dangerous.
4. You can change the evaluation strategy for a given dimension. For
example, it might turn out that you have more data than will fit in
RAM. Maybe you can reduce your working set size by making one or more of the
dimensions temporal, so that at any given time, only data that doesn’t
depend on those dimensions, or data for the current value of those
dimensions, is in RAM. Or maybe, although the data fits in RAM, you’re
thrashing your cache, so you can induce the system to “tile” into 8×8
blocks of coordinates along two of the dimensions, improving locality
of reference. All this without changing the application code
describing the function to be computed.
All of this is pretty speculative until there’s a working system,
though.
Current status of that: you can use arbitrary operations
on constants and iotas,
and reduce with arbitrary operations over any dimension,
but no composition yet,
and you can select the points where a boolean holds. And of
course this is all CPython so you lose the potential efficiency
advantage,
and furthermore it doesn’t take advantage of regularity
that may occur in the dimensions of a function,
for example, if it fills a whole rectangular volume with no holes.
"""
import operator
class Function:
def __init__(self, dimensions, abscissas, ordinate):
self.dimensions = tuple(dimensions)
self.abscissas = [tuple(a) for a in abscissas]
self.ordinate = ordinate
def abscissas_along_dimension(self, dimension):
index = self.dimensions.index(dimension) # otherwise, the universe
return [abscissa[index] for abscissa in self.abscissas]
def as_table_cells(self):
yield self.dimensions + ('',)
for d, o in zip(self.abscissas, self.ordinate):
yield d + (o,)
def __str__(self):
rows = [map(str, row) for row in self.as_table_cells()]
if not rows:
return '(function defined nowhere)'
widths = [max(len(row[i]) for row in rows) for i in range(len(rows[0]))]
return ''.join(' | '.join(cell.rjust(width)
for cell, width in zip(row, widths))
+ '\n'
for row in rows)
def get_at(self, dimensions, values):
# Figure out where in the query to find each of the columns of
# our abscissa. Also ensures that this query is for at most
# one point. (Otherwise use __getitem__!)
query_indexes = [dimensions.index(d) for d in self.dimensions]
# Now rearrange the query to conform to our abscissa.
values = tuple([values[i] for i in query_indexes])
for i in range(len(self.abscissas)):
assert type(self.abscissas[i]) is type(values)
if self.abscissas[i] == values:
return self.ordinate[i]
def sum(self, dimension):
return self.reduce(dimension, operator.add)
def product(self, dimension):
return self.reduce(dimension, operator.mul)
def max(self, dimension):
return self.reduce(dimension, max)
def min(self, dimension):
return self.reduce(dimension, min)
def all(self, dimension):
return self.reduce(dimension, operator.__and__)
def any(self, dimension):
return self.reduce(dimension, operator.__or__)
def reduce(self, dimension, op):
if dimension not in self.dimensions:
return self
idx = self.dimensions.index(dimension)
columns = transpose(self.abscissas)
da, db = self.dimensions[:idx], self.dimensions[idx+1:]
ca, cb = columns[:idx], columns[idx+1:]
r_abscissas = uniq(column_cross_product(ca + cb))
ordinate_dict = {}
for a, o in zip(self.abscissas, self.ordinate):
na = a[:idx] + a[idx+1:] # new abscissa
if na in ordinate_dict:
ordinate_dict[na] = op(ordinate_dict[na], o)
else:
ordinate_dict[na] = o
return Function(da + db, r_abscissas,
[ordinate_dict[a] for a in r_abscissas])
def where(self, boolean):
boolean = as_function(boolean)
# XXX this is not really the right thing to do. Instead we
# should add the missing dimensions into the result!
assert not any(d not in self.dimensions
for d in boolean.dimensions), (
"Reduce your boolean from dimensions %s to dimensions %s first" %
(boolean.dimensions, self.dimensions))
ra = []
ro = []
for a, o in zip(self.abscissas, self.ordinate):
b = boolean.get_at(self.dimensions, a)
if b is not None and b:
ra.append(a)
ro.append(o)
return Function(self.dimensions, ra, ro)
def __getitem__(self, other):
other = as_function(other)
XXX
def make_broadcasting_op(scalar_op):
def broadcasting_op(self, other):
other = as_function(other)
dimensions, abscissas = dimension_union(self.dimensions, other.dimensions,
self.abscissas, other.abscissas)
r_abscissas = []
ordinate = []
for a in abscissas:
av = self.get_at(dimensions, a)
bv = other.get_at(dimensions, a)
if av is not None and bv is not None:
r_abscissas.append(a)
ordinate.append(scalar_op(av, bv))
return Function(dimensions, r_abscissas, ordinate)
return broadcasting_op
def make_unary_broadcasting_op(scalar_op):
return lambda self: Function(self.dimensions, self.abscissas,
[scalar_op(item) for item in self.ordinate])
def _add_broadcasting_ops():
for name in ['__add__', '__sub__', '__mul__', '__div__', '__and__',
'__eq__', '__ge__', '__gt__', '__le__', '__lshift__',
'__lt__', '__mod__', '__ne__', '__or__', '__pow__',
'__rshift__', '__truediv__', '__xor__']:
setattr(Function, name, make_broadcasting_op(getattr(operator, name)))
for name, op in [('__radd__', operator.add),
('__rsub__', lambda a, b: b - a),
('__rmul__', operator.mul),
('__rdiv__', lambda a, b: b / a),
]:
setattr(Function, name, make_broadcasting_op(op))
for name in ['__abs__', '__pos__', '__invert__', '__neg__']:
setattr(Function, name,
make_unary_broadcasting_op(getattr(operator, name)))
_add_broadcasting_ops()
def as_function(obj):
if isinstance(obj, Function):
return obj
# By default return a constant function.
return Function([], [()], [obj])
def dimension_union(d1, d2, a1, a2):
d3 = list(d1[:])
for d in d2:
if d not in d3:
d3.append(d)
a1_columns = transpose(a1)
a2_columns = transpose(a2)
r_columns_1 = [intersect(uniq(a1_column), uniq(a2_columns[d2.index(d)]))
if d in d2 else uniq(a1_column)
for d, a1_column in zip(d1, a1_columns)]
r_columns_2 = [uniq(a2_columns[d2.index(d)]) for d in d3[len(d1):]]
return d3, column_cross_product(r_columns_1 + r_columns_2)
def column_cross_product(columns):
return list(_column_cross_product(columns))
def _column_cross_product(columns):
if not columns:
yield ()
return
for item in columns[0]:
for cross in _column_cross_product(columns[1:]):
yield (item,) + cross
def transpose(rows):
return zip(*rows)
def uniq(column):
return list(_uniq(column))
def _uniq(column):
seen = set()
for item in column:
if item not in seen:
yield item
seen.add(item)
def intersect(a, b):
return list(_intersect(a, b))
def _intersect(a, b):
s = set(b)
for item in a:
if item in s:
yield item
class Dimension:
def __init__(self, values, name=None):
self.values = values
if name is None:
name = generate_dimension_name()
self.name = name
def __str__(self):
return self.name
def __repr__(self):
vr = ' [%s..%s]' % (self.values[0], self.values[-1]) if self.values else ''
return '' % (self.name, vr, len(self.values))
def iota(self):
return Function([self], [[v] for v in self.values], self.values)
dimension_digits = 'foo bar baz to but too mad'.split()
dimension_counter = 0
def generate_dimension_name():
global dimension_counter
n = dimension_counter
dimension_counter += 1
rv = []
while True:
n, m = divmod(n, len(dimension_digits))
rv.append(dimension_digits[m])
if not n:
break
return ''.join(rv)
if __name__ == '__main__':
d = Dimension(range(5), 'd')
e = Dimension(range(6), 'e')
k = as_function(9)
i = d.iota()
j = e.iota()
print i
print i * i
print k
print i * k
print k * i
print i * j
print i + 1
print 2 + j
print i / (i + 1.0)
print j & j + 1
print 1 - i
print i * 2 > i
print abs(j - 2)
print ~i
print (i * j).sum(d)
print (i**2).sum(d)
print (i*i).sum(d)
print (i+1).product(d) # Factorial!
f = Dimension(range(12), 'f')
g = Dimension(range(12), 'g')
fi = f.iota()
gi = g.iota()
print ((fi < gi) * fi + 1).product(fi) # Many factorials!
x = fi * gi
print x.where(fi > gi)
# A better way to do factorial that doesn’t work yet
print (fi.where(fi < gi) + 1).product(fi)