Recipes

From ActiveArchives
Jump to: navigation, search

As of July 2018, I'm using this page to post "starting points" of code I think could be useful. Michael Murtaugh (talk) 14:00, 14 July 2018 (CEST)

Generic HTML filter

Even without any further manipulation, passing HTML content through html5lib's parser and serializing the resulting tree is a way of ensuring a clean HTML structure.

# Generic HTML filter: parse HTML with html5lib and write the tree back out.
# Reads from --input (default stdin) and writes to --output (default stdout).
import argparse
import sys
from xml.etree import ElementTree as ET

import html5lib

ap = argparse.ArgumentParser("")
ap.add_argument("--input", type=argparse.FileType('r'), default=sys.stdin)
ap.add_argument("--output", type=argparse.FileType('w'), default=sys.stdout)
args = ap.parse_args()
# Build a plain (non-namespaced) ElementTree so the tree can be queried and
# serialized with the standard library's ElementTree API.
t = html5lib.parse(args.input, namespaceHTMLElements=False, treebuilder="etree")
# eventually manipulate t
print(ET.tostring(t, method="html", encoding="unicode"), file=args.output)

Show all the links in one or more HTML inputs

import argparse
import html5lib
 
# Print the href of every <a href=...> found in each HTML file named on the
# command line, one per line, in document order.
parser = argparse.ArgumentParser("")
parser.add_argument("input", nargs="+")
options = parser.parse_args()
for filename in options.input:
    with open(filename) as source:
        # Non-namespaced tree, so the plain "a" tag name matches in findall.
        tree = html5lib.parse(source, namespaceHTMLElements=False)
        anchors = tree.findall(".//a[@href]")
        for anchor in anchors:
            print(anchor.attrib.get("href"))

Post-etherbox-event link fixer

# relink.py
import argparse
import html5lib
from urllib.parse import urlparse, quote as urlquote, unquote as urlunquote
import os, sys
from xml.etree import ElementTree as ET 
 
# Hosts whose absolute URLs are rewritten to links relative to the current page.
LOCAL_HOST = "etherbox.local"
PAD_HOSTS = ("etherbox.lan:9001", "10.9.8.7:9001")


def relink_href(href, curpath, basepath=None):
    """Compute the relative replacement for an absolute etherbox link.

    Args:
        href: The original value of the link's ``href`` attribute.
        curpath: Directory of the page containing the link ("" for the
            current working directory).
        basepath: Optional directory that site-root paths are resolved
            against; when None they are resolved against the current
            working directory.

    Returns:
        A ``(kind, new_href)`` tuple, where kind is ``"rel"`` for a link to
        the local site and ``"pad"`` for a link to a pad, or ``None`` when
        the link is relative or points at any other host and must be left
        untouched.
    """
    if not href.startswith(("http://", "https://")):
        # RELATIVE LINK (or another scheme): nothing to rewrite
        return None
    p = urlparse(href)
    suffix = ""
    if p.netloc == LOCAL_HOST:
        # ABSOLUTE LOCAL LINK (directories + files)
        kind = "rel"
        path = urlunquote(p.path.lstrip("/"))
        # Keep the query and fragment so in-page anchors keep working.
        if p.query:
            suffix += "?" + p.query
        if p.fragment:
            suffix += "#" + p.fragment
    elif p.netloc in PAD_HOSTS:
        # ABSOLUTE PAD LINK: point at the dumped HTML of the pad instead
        kind = "pad"
        _, padname = os.path.split(p.path)
        path = "etherdump/" + urlunquote(padname) + ".diff.html"
    else:
        return None
    if basepath:
        path = os.path.join(basepath, path)
    # A link to the site root yields an empty path, which os.path.relpath
    # rejects with ValueError; "." denotes the same directory.
    relhref = urlquote(os.path.relpath(path or ".", curpath))
    return kind, relhref + suffix


def main():
    """Rewrite absolute etherbox links in the given HTML files.

    Each change is reported on stderr. Files are only modified when
    --mogrify is given; the original is then kept as ``<name>~``.
    """
    ap = argparse.ArgumentParser("")
    ap.add_argument("--basepath", default=None, help="base path of local site, default cwd")
    ap.add_argument("--mogrify", default=False, action="store_true", help="change files in place")
    ap.add_argument("input", nargs="+")
    args = ap.parse_args()
    for n in args.input:
        changes = 0
        curpath, _ = os.path.split(n)
        with open(n) as f:
            t = html5lib.parse(f, namespaceHTMLElements=False)
        for a in t.findall(".//a[@href]"):
            href = a.attrib.get("href")
            result = relink_href(href, curpath, args.basepath)
            if result is None:
                continue
            kind, relhref = result
            print(f"{kind} {href} -> {relhref}", file=sys.stderr)
            a.attrib['href'] = relhref
            changes += 1

        if changes > 0 and args.mogrify:
            # Keep a backup of the original next to the rewritten file.
            os.rename(n, n + "~")
            with open(n, "w") as f:
                print(ET.tostring(t, method="html", encoding="unicode"), file=f)


if __name__ == "__main__":
    main()
Personal tools
Namespaces

Variants
Actions
Navigation
Toolbox