#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Sort hashtags by the information they provide.

I thought this might be a useful way to identify candidate filtering
criteria for logfiles or something: prefer predicates that are
satisfied frequently enough to cover a useful fraction of the
collection, but not so frequently that each occurrence provides a
uselessly small amount of information.

For example, given the following input:

    #everything #is #half #cool
    #not #everything #is #half #awesome
    #my #cool #light #is #flickering #like #everything
    #stopwords #are #everything
    #everything #is #everything
    #not #half #everything #can #be #hashtagged

it correctly identifies #cool and #not as the most usefully selective
tags, followed by #half, while the useless #everything comes in last,
preceded by its runner-up in popularity, #is.

However, when I applied it to an actually meaningful file, all of my
tags were sufficiently infrequent that the ordering was exactly the
frequency ordering.  Bits per occurrence varied from 6.8 for the most
common tag up to 14 for the tags that occurred only once.

"""
import collections
import math
import re
import sys


def main(lines):
    # A hashtag is "#" followed by anything that isn't whitespace,
    # punctuation, brackets, or a non-ASCII byte.
    tag_re = re.compile(r"#[^][\n'#,.!?/:;()<> \x80-\xff]+")
    tag_entropy, tag_counts = characterize(tag_re.findall(line)
                                           for line in lines)
    for tag in sorted(tag_entropy.keys(), key=tag_entropy.get, reverse=True):
        print("%8.2f %8d %s" % (tag_entropy[tag], tag_counts[tag], tag))


def characterize(items):
    """Score each predicate by the total information its occurrences carry.

    A predicate that holds for k of n items conveys lg(n/k) bits per
    occurrence, so k * lg(n/k) bits in total.
    """
    pred_counts = collections.Counter()
    n = 0
    for item in items:
        n += 1
        for pred in set(item):  # count each predicate at most once per item
            pred_counts[pred] += 1
    pred_total_entropy = {pred: pred_counts[pred] * lg(n / pred_counts[pred])
                          for pred in pred_counts}
    return pred_total_entropy, pred_counts


def lg(n, base=math.log(2)):
    """Base-2 logarithm."""
    return math.log(n) / base


if __name__ == '__main__':
    main(sys.stdin)
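
# A minimal worked example (my addition, not part of the original
# script): a smoke test that runs characterize() on the docstring's
# sample input and checks the ordering claimed there.  The regex
# duplicates main()'s pattern since tag_re is local to main().
def _demo():
    sample = [
        "#everything #is #half #cool",
        "#not #everything #is #half #awesome",
        "#my #cool #light #is #flickering #like #everything",
        "#stopwords #are #everything",
        "#everything #is #everything",
        "#not #half #everything #can #be #hashtagged",
    ]
    tag_re = re.compile(r"#[^][\n'#,.!?/:;()<> \x80-\xff]+")
    entropy, counts = characterize(tag_re.findall(line) for line in sample)
    # With n = 6 lines, a tag occurring on k lines scores k * lg(6/k)
    # total bits: #cool and #not (k=2) score 2 * lg(3) ~ 3.17,
    # #half (k=3) scores 3 * lg(2) = 3.0, one-off tags like #awesome
    # score lg(6) ~ 2.58, #is (k=4) scores 4 * lg(1.5) ~ 2.34, and
    # #everything (k=6) scores 6 * lg(1) = 0.
    assert entropy['#everything'] == 0.0
    assert entropy['#cool'] == entropy['#not'] > entropy['#half']
    assert entropy['#half'] > entropy['#awesome'] > entropy['#is']

# Call _demo() manually, e.g. from a REPL, to verify the ordering.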