Pretty HTML with BeautifulSoup

3 February 2018

A recipe to make every HTML file under the specified directory as pretty as can be. This Python script recursively visits every *.html file and uses BeautifulSoup (with the lxml parser) to parse and reformat it in place.

import argparse
import bs4
import glob

def prettify_file(filename):
    """Prettify the specified HTML file in place.

    The file is read, parsed with BeautifulSoup's "lxml" parser, and then
    overwritten with the prettified markup. No backup is made here, so keep
    one yourself.
    """
    with open(filename, "r") as source:
        original = source.read()
    # Build the complete output before reopening the file for writing:
    # mode "w" truncates the file, so any parse/format error must surface
    # while the original content is still intact on disk.
    pretty = bs4.BeautifulSoup(original, "lxml").prettify()
    with open(filename, "w") as out:
        out.write(pretty)

def prettify_dir(dirname):
    """Recursively prettify all *.html files under dirname.

    Every matching file is passed to prettify_file, which rewrites it
    in place.
    """
    # Escape dirname so that glob metacharacters in the directory name
    # ("*", "?", "[") are matched literally instead of acting as a pattern.
    pattern = glob.escape(dirname) + "/**/*.html"
    for filename in glob.glob(pattern, recursive=True):
        prettify_file(filename)

# Command-line entry point: take one positional directory argument and
# prettify every html file beneath it.
desc = "Prettify all html files under the specified directory, in-place."
parser = argparse.ArgumentParser(description=desc)
# add_argument returns an Action object that is not needed here, so its
# result is deliberately not bound to a name.
parser.add_argument("directory", help="directory to search for *.html files")
args = parser.parse_args()
prettify_dir(args.directory)

Albatross

Versions of Python before 3.5 didn’t support recursive glob, so upgrade or use this function instead:

import os
def prettify_dir(dirname):
    """Walk the tree rooted at dirname and prettify each *.html file found."""
    for folder, _subdirs, entries in os.walk(dirname):
        # Collect this folder's html paths, then hand each to prettify_file.
        html_paths = [
            os.path.join(folder, entry)
            for entry in entries
            if entry.endswith('.html')
        ]
        for path in html_paths:
            prettify_file(path)

html
python