XML

From ActiveArchives
Jump to: navigation, search

parse + output XML

# Parse the XML (SVG) file named on the command line and print it to stdout.
import argparse
from xml.etree import ElementTree as ET

# The first positional parameter of ArgumentParser is `prog`; passing "" there
# blanks the program name in the usage line, so use `description` instead.
ap = argparse.ArgumentParser(description="Parse an XML file and print it back out.")
ap.add_argument("input", help="path of the XML file to read")
args = ap.parse_args()

# Map the SVG namespace to the empty prefix so serialised elements are written
# as <svg>, <path>, ... instead of <ns0:svg>, <ns0:path>, ...
ET.register_namespace("", "http://www.w3.org/2000/svg")

# Hand the path to the parser instead of a text-mode file object: the parser
# then reads raw bytes and honours the encoding declared by the document,
# rather than whatever the platform's default text encoding happens to be.
t = ET.parse(args.input)
print(ET.tostring(t.getroot(), encoding="unicode"))

namespaces in search & output

# Search a namespaced XML (SVG) document and serialise it without prefixes.
import argparse
import sys
from xml.etree import ElementTree as ET

SVG_NS = "http://www.w3.org/2000/svg"

# The first positional parameter of ArgumentParser is `prog`; passing "" there
# blanks the program name in the usage line, so use `description` instead.
ap = argparse.ArgumentParser(description="List SVG polylines and re-serialise the document.")
ap.add_argument("input", help="path of the XML file to read")
ap.add_argument("--output", type=argparse.FileType('w'), default=sys.stdout,
                help="file to write the serialised XML to (default: stdout)")
args = ap.parse_args()

# Hand the path to the parser instead of a text-mode file object: the parser
# then reads raw bytes and honours the encoding declared by the document,
# rather than whatever the platform's default text encoding happens to be.
t = ET.parse(args.input)

# find() / findall() take a prefix -> namespace-URI mapping as second argument;
# the prefix used in the path only has to match a key of that mapping.
for p in t.findall(".//svg:polyline", {"svg": SVG_NS}):
    print(p)

# nb: register_namespace only affects serialisation
ET.register_namespace("", SVG_NS)
print(ET.tostring(t.getroot(), encoding="unicode"), file=args.output)

etreeutils.py

from __future__ import print_function

import re
import sys

try:
    import lxml.etree as ET
except ImportError:
    from xml.etree import ElementTree as ET
try:
    from urllib.parse import urljoin
except ImportError:
    from urlparse import urljoin
 
 
def innerHTML (elt):
    """Return the markup contained in *elt* as a single HTML string.

    The result is the element's own leading text (an empty string when it has
    none) followed by each direct child serialised with
    ``ET.tostring(child, method="html", encoding="unicode")``.
    """
    leading = elt.text if elt.text is not None else u""
    serialised = (
        ET.tostring(child, method="html", encoding="unicode") for child in elt
    )
    return leading + u"".join(serialised)
 
def textContent (elt):
    """Return the text contained in *elt* with all markup stripped.

    The result is the element's own leading text (an empty string when it has
    none) followed by the ``method="text"`` serialisation of each direct
    child. Each child is serialised to UTF-8 bytes and decoded back to text.
    """
    pieces = [elt.text if elt.text is not None else u""]
    for child in elt:
        raw = ET.tostring(child, method="text", encoding="utf8")
        pieces.append(raw.decode("utf-8"))
    return u"".join(pieces)
 
def hn (elt):
    """Return the highest-ranking heading element found below *elt*.

    Heading levels are probed in order h1, h2, ... h6; the first level that
    has a match anywhere below *elt* wins, and the first such element in
    document order is returned.

    Returns None when *elt* contains no h1..h6 descendant.
    """
    # Levels 1..6 inclusive: HTML defines h1 through h6.
    for level in range(1, 7):
        h = elt.find(".//h{0}".format(level))
        # Compare against None explicitly rather than relying on truthiness.
        if h is not None:
            return h
    return None
 
def absolutize_links(t, baseurl):
    """Rewrite every ``href`` and ``src`` attribute below *t* as an absolute URL.

    Each attribute value is resolved against *baseurl* with ``urljoin`` and
    written back in place; nothing is returned.
    """
    link_queries = (
        ("href", ".//*[@href]"),
        ("src", ".//*[@src]"),
    )
    for attr_name, query in link_queries:
        for node in t.findall(query):
            node.set(attr_name, urljoin(baseurl, node.get(attr_name)))
 
def parentchilditer (elt):
    """Yield a ``(parent, child)`` pair for every parent/child link in *elt*.

    Parents are visited in the order produced by ``elt.iter()`` (starting
    with *elt* itself); for each parent, its direct children are yielded in
    order.
    """
    for node in elt.iter():
        for kid in node:
            pair = (node, kid)
            yield pair
 
def parentchilditerwithindex (elt):
    """Yield ``(parent, child, index)`` for every parent/child link in *elt*.

    Parents are visited in the order produced by ``elt.iter()`` (starting
    with *elt* itself); *index* is the zero-based position of *child* among
    the children of *parent*.
    """
    for node in elt.iter():
        for position, kid in enumerate(node):
            triple = (node, kid, position)
            yield triple
 
def replace_elt (t, elt, tag):
    """Replace *elt*, somewhere below *t*, with a new empty element named *tag*.

    The new element takes the position *elt* had among its siblings. It is
    created empty: the text, attributes and children of *elt* are not copied.
    NOTE(review): ``elt.tail`` is not transferred to the new element either;
    confirm that dropping the trailing text is intended.

    Returns the new element, or None when *elt* is not a descendant of *t*
    (this includes the case where *elt* is *t* itself, which has no parent
    inside the tree being searched).
    """
    for parent in t.iter():
        for index, child in enumerate(parent):
            # Identity, not equality: we want this exact node.
            if child is elt:
                # Build a detached element. ET.SubElement() would already
                # append it to `parent`, and the insert() below would then
                # add it a second time under xml.etree.ElementTree.
                newelt = ET.Element(tag)
                parent.remove(elt)
                parent.insert(index, newelt)
                # Return immediately: `parent` was just mutated while being
                # iterated, so the loops must not continue.
                return newelt
    return None
 
def containing_tags (elt, fromelt):
    """Return the chain of elements leading from *fromelt* down to *elt*.

    The result is a list that starts with *fromelt*, ends with *elt*, and
    contains every element in between, outermost first. When *elt* is
    *fromelt* the list is ``[elt]``.

    Returns None when *elt* is not *fromelt* and is not found below it.
    """
    # Identity, not equality: we are looking for this exact node.
    if elt is fromelt:
        return [elt]
    for child in fromelt:
        path = containing_tags(elt, child)
        if path:
            # Stop as soon as the path is known instead of also walking all
            # remaining sibling subtrees.
            return [fromelt] + path
    return None
 
def is_header (elt):
    """Return True when the tag of *elt* is ``h`` followed by a single digit.

    Elements whose tag is not a string (the tag of a comment or processing
    instruction node, for example) are reported as not being headers instead
    of raising.
    """
    try:
        found = re.search(r"^h\d$", elt.tag)
    except TypeError:
        # elt.tag is not a string, so it cannot name a heading.
        return False
    return found is not None
Personal tools
Namespaces

Variants
Actions
Navigation
Toolbox