#!/usr/bin/python3 """Organize the posts in https://subsymbol.org/journal by week. The index page on the site is currently broken but the source form of the posts can be obtained with git clone https://github.com/ambimorph/subsymbol.org.git Some are in HTML; others are in reStructuredText. This program writes them as HTML to files named 'byweek-02014-23.html', etc. Tested in Python 3.8. """ import datetime, os, pprint, re, sys import docutils.core, bs4 # We have to use the filenames to get the dates because many of the # posts are missing .meta files. In one case (in 02014) the year is # omitted, and in another case the ordinal suffix is omitted. fname_pattern = re.compile(r''' .* (?P sunday | monday | tuesday | wednesday | thursday | friday | saturday) - (?P february | june | july | august) - (?P \d+) (?: st | nd | rd | th)? (?: - (?P \d+))? \. (?: rst | html) \Z ''', re.X) months = dict(february=2, june=6, july=7, august=8) def get_date(filename): "Given a filename, return the datetime when the journal entry was posted." mo = fname_pattern.match(filename) year = mo.group('year') if year is None: year = 2014 dt = datetime.datetime(year=int(year), month=months[mo.group('month')], day=int(mo.group('day'))) # The post for 02014-07-03 was incorrectly marked as "June", but # the .meta and the git log of protagonist (commit 8d3c468) tells # us it was actually written in July. if dt == datetime.datetime(2014, 6, 3): dt = dt.replace(month=7) # Yeah, I know assert is not a good way to handle errors in input # data... but in this case the input data is fixed and the program # is variable. wday = dt.strftime("%A").lower() assert wday == mo.group('wday'), (wday, mo.group('wday'), dt) return dt def get_week(filename): "Return the week such as '02014-26' when the journal entry was posted." # %U is a Sunday-origin week return get_date(filename).strftime("0%Y-%U") def posts(dirname='.'): """Return (week, datetime, filename) tuples in chronological order. Only includes .rst and .html files. Ignores .meta. """ candidates = [os.path.join(dirname, filename) for filename in os.listdir(dirname) if not filename.startswith('byweek-') and (filename.endswith('.html') or filename.endswith('.rst'))] return sorted([(get_week(f), get_date(f), f) for f in candidates]) def get_html(filename): "Render contents of named file as HTML, returning a Unicode string." with open(filename) as fo: data = fo.read() # XXX what charset will this use? if filename.endswith('.rst'): htmlbytes = docutils.core.publish_string(data, writer_name='html') return htmlbytes.decode('utf-8') else: return data def main(argv): dirname = '.' if len(argv) < 2 else argv[1] last_week = None for week, dt, filename in posts(dirname): print(week, dt, filename) assert week != None if week != last_week: # XXX we are depending on Python's GC to flush these files # in the finalizer output = open('byweek-{}.html'.format(week), 'w') last_week = week output.write(""" Posts for week of {} """.format(week)) output.write('

{}

\n'.format(filename)) # XXX soup.body.prettify badly mangles because it doesn't know whitespace is significant; # not sure I've avoided such mangling by *not* prettifying, # but I guess I'll fix it later if I notice it... soup = bs4.BeautifulSoup(get_html(filename), features='lxml') # https://stackoverflow.com/questions/21452823/beautifulsoup-how-should-i-obtain-the-body-contents soup.body.hidden = True output.write(str(soup.body)) if __name__ == '__main__': main(sys.argv)
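
# A quick sanity check of the week-key scheme used in get_week (an
# illustrative doctest-style note, not executed by the program; the date
# is just an example — June 29th, 02014 was a Sunday, so it falls in
# Sunday-origin week 26):
#
#     >>> datetime.datetime(2014, 6, 29).strftime("0%Y-%U")
#     '02014-26'
#
# which matches the '02014-26' example in get_week's docstring and the
# 'byweek-*.html' naming described in the module docstring.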