#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Sort hashtags by the information they provide.

I thought this might be a useful way to identify candidate filtering
criteria for logfiles or something: prefer predicates that are
satisfied frequently enough to cover a useful fraction of the
collection, but not so frequently that each occurrence provides a
uselessly small amount of information.

For example, given the following input:

    #everything #is #half #cool
    #not #everything #is #half #awesome
    #my #cool #light #is #flickering #like #everything
    #stopwords #are #everything
    #everything #is #everything
    #not #half #everything #can #be #hashtagged

it correctly identifies #cool and #not as the most usefully selective
tags, followed by #half, while the useless #everything comes in last,
preceded by its runner-up in popularity, #is.

However, when I applied it to an actually meaningful file, all of my
tags were sufficiently infrequent that the ordering was exactly the
frequency ordering.  Bits per occurrence varied from 6.8 for the most
common tag up to 14 for the tags that occurred only once.

"""
import collections
import math
import re
import sys


def main(lines):
    # A hashtag is "#" followed by anything that isn't whitespace,
    # punctuation, brackets, or a non-ASCII byte.
    tag_re = re.compile(r"#[^][\n'#,.!?/:;()<> \x80-\xff]+")
    tag_entropy, tag_counts = characterize(tag_re.findall(line)
                                           for line in lines)
    for tag in sorted(tag_entropy.keys(), key=tag_entropy.get, reverse=True):
        print("%8.2f %8d %s" % (tag_entropy[tag], tag_counts[tag], tag))


def characterize(items):
    """Score each predicate by the total information its occurrences carry.

    A predicate that holds for k of n items conveys lg(n/k) bits per
    occurrence, so k * lg(n/k) bits in total.
    """
    pred_counts = collections.Counter()
    n = 0
    for item in items:
        n += 1
        for pred in set(item):  # count each predicate at most once per item
            pred_counts[pred] += 1
    pred_total_entropy = {pred: pred_counts[pred] * lg(n / pred_counts[pred])
                          for pred in pred_counts}
    return pred_total_entropy, pred_counts


def lg(n, base=math.log(2)):
    """Base-2 logarithm."""
    return math.log(n) / base


if __name__ == '__main__':
    main(sys.stdin)
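
# A minimal worked example (my addition, not part of the original
# script): a smoke test that runs characterize() on the docstring's
# sample input and checks the ordering claimed there.  The regex
# duplicates main()'s pattern since tag_re is local to main().
def _demo():
    sample = [
        "#everything #is #half #cool",
        "#not #everything #is #half #awesome",
        "#my #cool #light #is #flickering #like #everything",
        "#stopwords #are #everything",
        "#everything #is #everything",
        "#not #half #everything #can #be #hashtagged",
    ]
    tag_re = re.compile(r"#[^][\n'#,.!?/:;()<> \x80-\xff]+")
    entropy, counts = characterize(tag_re.findall(line) for line in sample)
    # With n = 6 lines, a tag occurring on k lines scores k * lg(6/k)
    # total bits: #cool and #not (k=2) score 2 * lg(3) ~ 3.17,
    # #half (k=3) scores 3 * lg(2) = 3.0, one-off tags like #awesome
    # score lg(6) ~ 2.58, #is (k=4) scores 4 * lg(1.5) ~ 2.34, and
    # #everything (k=6) scores 6 * lg(1) = 0.
    assert entropy['#everything'] == 0.0
    assert entropy['#cool'] == entropy['#not'] > entropy['#half']
    assert entropy['#half'] > entropy['#awesome'] > entropy['#is']

# Call _demo() manually, e.g. from a REPL, to verify the ordering.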