Module minidom_ext
Functions to improve xml.dom.minidom tools in Python.
Expand source code
#!/usr/bin/env python3.7
# -*- coding: utf-8 -*-
"""
Functions to improve xml.dom.minidom tools in Python.
"""
import os
from xml.dom.minidom import Node, Element, parse, parseString
import re
from lxml import etree # http://lxml.de/index.html#documentation
# pdoc3 --html --force minidom_ext.py
#==================================================
#============ Tools ===============================
#==================================================
def _existFile(f):
""" tests if the file exists """
return os.path.isfile(f)
def _existDir(d):
""" tests if the directory exists """
return os.path.exists(d)
#==================================================
#============ class DOMCompanion ==================
#==================================================
class DOMCompanion :
"""
Functions to improve xml.dom.minidom tools in Python.
"""
# ===========================================================================================
def __init__(self, doc = None) :
"""
class constructor.
Parameters
----------
doc : Node.DOCUMENT_NODE, optional
DOM structure
Notes
-----
The DOM is also enriched with default attributes if a DTD is specified
"""
self.doc = doc
"""
the DOM structure : Node.DOCUMENT_NODE
"""
self.documentElement = None
"""
equivalent to doc.documentElement : Node.ELEMENT_NODE
"""
self.lid = dict()
if doc is not None :
self.documentElement = doc.documentElement
self.enrichXML()
# ===========================================================================================
def parse(self, file, validate = False):
"""
to load an XML file
Parameters
----------
file : str
file that contains the XML file to load
validate : boolean, optional
flag to validate the XML file if it contains a Doctype section
Returns
-------
boolean
the DOM is valid or not according to the specified DTD
True if there is no DTD
See Also
--------
`DOMCompanion.validate`
Notes
-----
if a DTD is specified, uses it to add default attributes and to collect IDs
See https://docs.python.org/3/library/xml.dom.minidom.html
"""
if _existFile(file) :
self.doc = parse(file)
self.documentElement = self.doc.documentElement
if validate :
if self.validate() :
return True
else :
self._enrichXML()
return False
else :
self._enrichXML()
return True
# ===========================================================================================
def parseString(self, xml, validate = False):
"""
to load an XML string
Parameters
----------
xml : str
the string that contains the XML
validate : boolean, optional
flag to validate the XML file if it contains a Doctype section
Returns
-------
boolean
the DOM is valid or not according to the specified DTD
True if there is no DTD
See Also
--------
`DOMCompanion.validate`
Notes
-----
if a DTD is specified, uses it to add default attributes and to collect IDs
See https://docs.python.org/3/library/xml.dom.minidom.html
"""
self.doc = parseString(xml)
self.documentElement = self.doc.documentElement
if validate :
if self.validate() :
return True
else :
self._enrichXML()
return False
else :
self._enrichXML()
return True
# ===========================================================================================
def getElementsByTagName(self, name) :
"""
the DOM getElementsByTagName
Parameters
----------
name : str
the Element to find in the DOM
Returns
-------
NodeList or None
a list of elements or None
"""
if self.doc is not None :
return self.doc.getElementsByTagName(name)
else:
return None
# ===========================================================================================
def getElementById(self, id) :
"""
to retrieve an element by its ID
Parameters
----------
id : str
the ID of the element to find
Returns
-------
Node.ELEMENT_NODE or None
the element or None
"""
if id in self.lid.keys() :
return self.lid[id]
else :
return None
# ===========================================================================================
def getAttributsByIdref(self, id) :
"""
to retrieve IDREF attributs with an ID
Parameters
----------
id : str
the ID to looking for
Returns
-------
List(Attr)
the attributs
"""
# if id in self.lid.keys() :
# return self.lid[id]
# else :
return self._getIdrefs(self.doc.documentElement, id)
# ===========================================================================================
def toLighter(self, del_spaces = True, del_comments = True, del_pi = True) :
"""
to suppress text nodes (with only separators), processing instructions and/or comments
Parameters
----------
del_spaces : boolean, optional
to suppress blank nodes (with only newline, tabulation et space caracters)
del_comments : boolean, optional
to suppress comment nodes
del_spaces : boolean, optional
to suppress processing instruction nodes
Returns
-------
DOMCompaniom
itself
"""
if self.doc is not None :
self._purgeDOM(self.doc, del_spaces, del_comments, del_pi)
return self
# ===========================================================================================
def validate(self) :
"""
to validate the XML according its DTD (enrich it too). It uses lxml module to validate the XML document.
Returns
-------
boolean
the DOM is valid or not according to the specified DTD
"""
if self.doc is not None :
parser = etree.XMLParser(recover=True, strip_cdata=True)
tree = etree.XML(self.doc.toxml(), parser)
dtdFile = self._getDTDFile()
if dtdFile is not None :
if _existFile(dtdFile) :
dtd = etree.DTD(dtdFile)
if dtd.validate(tree) :
self._enrichXML()
return True
else :
print(dtd.error_log.filter_from_errors()[0])
return False
else :
print('Unable to find the DTD file ',dtdFile)
return False
else:
self._enrichXML()
return True
else :
return False
# ===========================================================================================
def toxml(self) :
"""
produce XML string
Returns
-------
str
the XML string
Notes
-----
See https://docs.python.org/3/library/xml.dom.minidom.html
"""
return self.doc.toxml()
# ===========================================================================================
def toprettyxml(self,indent="\t", newl="\n", encoding=None, standalone=None) :
"""
produce pretty-printed version of the XML string
Returns
-------
str
the XML string
Notes
-----
See https://docs.python.org/3/library/xml.dom.minidom.html
"""
return self.doc.toprettyxml(ident, newl, encoding, standalone)
# ===========================================================================================
#####################################
########## private methods ##########
#####################################
def _getDTDFile(self) :
if self.doc.doctype is not None :
if self.doc.doctype.systemId is not None :
return self.doc.doctype.systemId
else : return None
else : return None
def _enrichXML(self) :
if self.doc is not None :
self.lid = dict()
dtdFile = self._getDTDFile()
if dtdFile is not None :
if _existFile(dtdFile) :
le = self._extractDTD(dtdFile)
self._enrichNode(self.doc.documentElement, le)
else :
print('Unable ti find the DTD file ',dtdFile)
else :
self._enrichNode(self.doc.documentElement, dict())
def _purgeDOM(self, no, del_spaces, del_comments, del_pi) :
if no.nodeType in [Node.ELEMENT_NODE, Node.DOCUMENT_NODE] :
toDel = []
for n in no.childNodes :
if del_spaces and n.nodeType == Node.TEXT_NODE and n.data.strip('\t \n') == '' :
toDel.append(n)
elif del_comments and n.nodeType == Node.COMMENT_NODE :
toDel.append(n)
elif del_pi and n.nodeType == Node.PROCESSING_INSTRUCTION_NODE :
toDel.append(n)
elif n.nodeType == Node.ELEMENT_NODE :
self._purgeDOM(n,del_spaces,del_comments, del_pi)
for n in toDel :
no.removeChild(n)
elif no.nodeType == Node.DOCUMENT_TYPE_NODE :
pass
else :
pass
return no
def _getIdrefs(self, no, value) :
idrefAttributes = list()
if no.nodeType == Node.ELEMENT_NODE :
latt = no.attributes
for i in range(latt.length) :
att = latt.item(i)
if att.value == value :
if self.lid[att.value] != no :
idrefAttributes.append(att)
for n in no.childNodes :
if n.nodeType == Node.ELEMENT_NODE :
idrefAttributes += self._getIdrefs(n,value)
else :
pass
return idrefAttributes
def _getDTD(self, file) :
if _existFile(file) :
f = open(file,'r')
dtd = f.read()
f.close()
return dtd
else :
return None
def _extractDTD(self, file) :
el = re.compile(r'<!ELEMENT (?P<elementname>[\w\-\:\_]+) (?P<description>.*)\s*>')
att = re.compile(r'<!ATTLIST (?P<elementname>[\w\-\:\_]+) (?P<attributs>.*)\s*>')
att2 = re.compile(r'(?P<attname>[\w\-\:\_]+) (?P<def>.*?) (?P<status>#[\w\-\:\_]+|[\"\'].*?[\"\'])')
comment = re.compile(r'<!-- \.*? -->')
dtd = self._getDTD(file).replace('\n',' ').replace('\t',' ')
cp = re.compile(r'<.*?>')
liste_elem = dict()
for item in cp.findall(dtd) :
cmnt = comment.match(item)
if cmnt is not None :
pass
else :
grp = el.match(item)
if grp is not None :
nomElem = grp.group('elementname').strip('\t \n')
liste_elem[nomElem] = dict()
else :
grp = att.match(item)
if grp is not None :
nomElem = grp.group('elementname')
for (nom, definition, status) in att2.findall(grp.group('attributs')) :
nomAtt = nom.strip('\t \n')
definition = definition.strip('\t \n')
status = status.strip('\t \n')
liste_elem[nomElem][nomAtt] = (definition, status.replace("'",'').replace('"',''))
return liste_elem
def _enrichNode(self, node, le) :
if node.nodeType == Node.ELEMENT_NODE :
if node.tagName in le :
la = le[node.tagName]
for (att, (definition, status)) in la.items() :
if definition == 'ID' :
nid = node.getAttribute(att)
self.lid[nid] = node
if node.hasAttribute(att) :
pass
else :
if '#' not in status :
node.setAttribute(att,status)
latt = node.attributes
for i in range(latt.length) :
att = latt.item(i)
if att.name.upper() == 'XML:ID' and att.value not in self.lid :
self.lid[att.value] = node
for n in node.childNodes :
self._enrichNode(n,le)
if ( __name__ == "__main__"):
cine = DOMCompanion()
cine.parse("semaine10.xml", True)
print(cine.doc.toxml())
print(cine.getElementById('Ka'))
print(cine.toLighter().toxml())
print(cine.getAttributsByIdref('Ka'))
Classes
class DOMCompanion (doc=None)
-
Functions to improve xml.dom.minidom tools in Python.
class constructor.
Parameters
doc
:Node.DOCUMENT_NODE
, optional- DOM structure
Notes
The DOM is also enriched with default attributes if a DTD is specified
Expand source code
class DOMCompanion : """ Functions to improve xml.dom.minidom tools in Python. """ # =========================================================================================== def __init__(self, doc = None) : """ class constructor. Parameters ---------- doc : Node.DOCUMENT_NODE, optional DOM structure Notes ----- The DOM is also enriched with default attributes if a DTD is specified """ self.doc = doc """ the DOM structure : Node.DOCUMENT_NODE """ self.documentElement = None """ equivalent to doc.documentElement : Node.ELEMENT_NODE """ self.lid = dict() if doc is not None : self.documentElement = doc.documentElement self.enrichXML() # =========================================================================================== def parse(self, file, validate = False): """ to load an XML file Parameters ---------- file : str file that contains the XML file to load validate : boolean, optional flag to validate the XML file if it contains a Doctype section Returns ------- boolean the DOM is valid or not according to the specified DTD True if there is no DTD See Also -------- `DOMCompanion.validate` Notes ----- if a DTD is specified, uses it to add default attributes and to collect IDs See https://docs.python.org/3/library/xml.dom.minidom.html """ if _existFile(file) : self.doc = parse(file) self.documentElement = self.doc.documentElement if validate : if self.validate() : return True else : self._enrichXML() return False else : self._enrichXML() return True # =========================================================================================== def parseString(self, xml, validate = False): """ to load an XML string Parameters ---------- xml : str the string that contains the XML validate : boolean, optional flag to validate the XML file if it contains a Doctype section Returns ------- boolean the DOM is valid or not according to the specified DTD True if there is no DTD See Also -------- `DOMCompanion.validate` Notes ----- if a DTD is specified, uses it to add default attributes and to collect IDs See https://docs.python.org/3/library/xml.dom.minidom.html """ self.doc = parseString(xml) self.documentElement = self.doc.documentElement if validate : if self.validate() : return True else : self._enrichXML() return False else : self._enrichXML() return True # =========================================================================================== def getElementsByTagName(self, name) : """ the DOM getElementsByTagName Parameters ---------- name : str the Element to find in the DOM Returns ------- NodeList or None a list of elements or None """ if self.doc is not None : return self.doc.getElementsByTagName(name) else: return None # =========================================================================================== def getElementById(self, id) : """ to retrieve an element by its ID Parameters ---------- id : str the ID of the element to find Returns ------- Node.ELEMENT_NODE or None the element or None """ if id in self.lid.keys() : return self.lid[id] else : return None # =========================================================================================== def getAttributsByIdref(self, id) : """ to retrieve IDREF attributs with an ID Parameters ---------- id : str the ID to looking for Returns ------- List(Attr) the attributs """ # if id in self.lid.keys() : # return self.lid[id] # else : return self._getIdrefs(self.doc.documentElement, id) # =========================================================================================== def toLighter(self, del_spaces = True, del_comments = True, del_pi = True) : """ to suppress text nodes (with only separators), processing instructions and/or comments Parameters ---------- del_spaces : boolean, optional to suppress blank nodes (with only newline, tabulation et space caracters) del_comments : boolean, optional to suppress comment nodes del_spaces : boolean, optional to suppress processing instruction nodes Returns ------- DOMCompaniom itself """ if self.doc is not None : self._purgeDOM(self.doc, del_spaces, del_comments, del_pi) return self # =========================================================================================== def validate(self) : """ to validate the XML according its DTD (enrich it too). It uses lxml module to validate the XML document. Returns ------- boolean the DOM is valid or not according to the specified DTD """ if self.doc is not None : parser = etree.XMLParser(recover=True, strip_cdata=True) tree = etree.XML(self.doc.toxml(), parser) dtdFile = self._getDTDFile() if dtdFile is not None : if _existFile(dtdFile) : dtd = etree.DTD(dtdFile) if dtd.validate(tree) : self._enrichXML() return True else : print(dtd.error_log.filter_from_errors()[0]) return False else : print('Unable to find the DTD file ',dtdFile) return False else: self._enrichXML() return True else : return False # =========================================================================================== def toxml(self) : """ produce XML string Returns ------- str the XML string Notes ----- See https://docs.python.org/3/library/xml.dom.minidom.html """ return self.doc.toxml() # =========================================================================================== def toprettyxml(self,indent="\t", newl="\n", encoding=None, standalone=None) : """ produce pretty-printed version of the XML string Returns ------- str the XML string Notes ----- See https://docs.python.org/3/library/xml.dom.minidom.html """ return self.doc.toprettyxml(ident, newl, encoding, standalone) # =========================================================================================== ##################################### ########## private methods ########## ##################################### def _getDTDFile(self) : if self.doc.doctype is not None : if self.doc.doctype.systemId is not None : return self.doc.doctype.systemId else : return None else : return None def _enrichXML(self) : if self.doc is not None : self.lid = dict() dtdFile = self._getDTDFile() if dtdFile is not None : if _existFile(dtdFile) : le = self._extractDTD(dtdFile) self._enrichNode(self.doc.documentElement, le) else : print('Unable ti find the DTD file ',dtdFile) else : self._enrichNode(self.doc.documentElement, dict()) def _purgeDOM(self, no, del_spaces, del_comments, del_pi) : if no.nodeType in [Node.ELEMENT_NODE, Node.DOCUMENT_NODE] : toDel = [] for n in no.childNodes : if del_spaces and n.nodeType == Node.TEXT_NODE and n.data.strip('\t \n') == '' : toDel.append(n) elif del_comments and n.nodeType == Node.COMMENT_NODE : toDel.append(n) elif del_pi and n.nodeType == Node.PROCESSING_INSTRUCTION_NODE : toDel.append(n) elif n.nodeType == Node.ELEMENT_NODE : self._purgeDOM(n,del_spaces,del_comments, del_pi) for n in toDel : no.removeChild(n) elif no.nodeType == Node.DOCUMENT_TYPE_NODE : pass else : pass return no def _getIdrefs(self, no, value) : idrefAttributes = list() if no.nodeType == Node.ELEMENT_NODE : latt = no.attributes for i in range(latt.length) : att = latt.item(i) if att.value == value : if self.lid[att.value] != no : idrefAttributes.append(att) for n in no.childNodes : if n.nodeType == Node.ELEMENT_NODE : idrefAttributes += self._getIdrefs(n,value) else : pass return idrefAttributes def _getDTD(self, file) : if _existFile(file) : f = open(file,'r') dtd = f.read() f.close() return dtd else : return None def _extractDTD(self, file) : el = re.compile(r'<!ELEMENT (?P<elementname>[\w\-\:\_]+) (?P<description>.*)\s*>') att = re.compile(r'<!ATTLIST (?P<elementname>[\w\-\:\_]+) (?P<attributs>.*)\s*>') att2 = re.compile(r'(?P<attname>[\w\-\:\_]+) (?P<def>.*?) (?P<status>#[\w\-\:\_]+|[\"\'].*?[\"\'])') comment = re.compile(r'<!-- \.*? -->') dtd = self._getDTD(file).replace('\n',' ').replace('\t',' ') cp = re.compile(r'<.*?>') liste_elem = dict() for item in cp.findall(dtd) : cmnt = comment.match(item) if cmnt is not None : pass else : grp = el.match(item) if grp is not None : nomElem = grp.group('elementname').strip('\t \n') liste_elem[nomElem] = dict() else : grp = att.match(item) if grp is not None : nomElem = grp.group('elementname') for (nom, definition, status) in att2.findall(grp.group('attributs')) : nomAtt = nom.strip('\t \n') definition = definition.strip('\t \n') status = status.strip('\t \n') liste_elem[nomElem][nomAtt] = (definition, status.replace("'",'').replace('"','')) return liste_elem def _enrichNode(self, node, le) : if node.nodeType == Node.ELEMENT_NODE : if node.tagName in le : la = le[node.tagName] for (att, (definition, status)) in la.items() : if definition == 'ID' : nid = node.getAttribute(att) self.lid[nid] = node if node.hasAttribute(att) : pass else : if '#' not in status : node.setAttribute(att,status) latt = node.attributes for i in range(latt.length) : att = latt.item(i) if att.name.upper() == 'XML:ID' and att.value not in self.lid : self.lid[att.value] = node for n in node.childNodes : self._enrichNode(n,le)
Instance variables
var doc
-
the DOM structure : Node.DOCUMENT_NODE
var documentElement
-
equivalent to doc.documentElement : Node.ELEMENT_NODE
Methods
def getAttributsByIdref(self, id)
-
to retrieve IDREF attributs with an ID
Parameters
id
:str
- the ID to looking for
Returns
List(Attr)
- the attributs
Expand source code
def getAttributsByIdref(self, id) : """ to retrieve IDREF attributs with an ID Parameters ---------- id : str the ID to looking for Returns ------- List(Attr) the attributs """ # if id in self.lid.keys() : # return self.lid[id] # else : return self._getIdrefs(self.doc.documentElement, id)
def getElementById(self, id)
-
to retrieve an element by its ID
Parameters
id
:str
- the ID of the element to find
Returns
Node.ELEMENT_NODE
orNone
- the element or None
Expand source code
def getElementById(self, id) : """ to retrieve an element by its ID Parameters ---------- id : str the ID of the element to find Returns ------- Node.ELEMENT_NODE or None the element or None """ if id in self.lid.keys() : return self.lid[id] else : return None
def getElementsByTagName(self, name)
-
the DOM getElementsByTagName
Parameters
name
:str
- the Element to find in the DOM
Returns
NodeList
orNone
- a list of elements or None
Expand source code
def getElementsByTagName(self, name) : """ the DOM getElementsByTagName Parameters ---------- name : str the Element to find in the DOM Returns ------- NodeList or None a list of elements or None """ if self.doc is not None : return self.doc.getElementsByTagName(name) else: return None
def parse(self, file, validate=False)
-
to load an XML file
Parameters
file
:str
- file that contains the XML file to load
validate
:boolean
, optional- flag to validate the XML file if it contains a Doctype section
Returns
boolean
- the DOM is valid or not according to the specified DTD True if there is no DTD
See Also
Notes
if a DTD is specified, uses it to add default attributes and to collect IDs See https://docs.python.org/3/library/xml.dom.minidom.html
Expand source code
def parse(self, file, validate = False): """ to load an XML file Parameters ---------- file : str file that contains the XML file to load validate : boolean, optional flag to validate the XML file if it contains a Doctype section Returns ------- boolean the DOM is valid or not according to the specified DTD True if there is no DTD See Also -------- `DOMCompanion.validate` Notes ----- if a DTD is specified, uses it to add default attributes and to collect IDs See https://docs.python.org/3/library/xml.dom.minidom.html """ if _existFile(file) : self.doc = parse(file) self.documentElement = self.doc.documentElement if validate : if self.validate() : return True else : self._enrichXML() return False else : self._enrichXML() return True
def parseString(self, xml, validate=False)
-
to load an XML string
Parameters
xml
:str
- the string that contains the XML
validate
:boolean
, optional- flag to validate the XML file if it contains a Doctype section
Returns
boolean
- the DOM is valid or not according to the specified DTD True if there is no DTD
See Also
Notes
if a DTD is specified, uses it to add default attributes and to collect IDs See https://docs.python.org/3/library/xml.dom.minidom.html
Expand source code
def parseString(self, xml, validate = False): """ to load an XML string Parameters ---------- xml : str the string that contains the XML validate : boolean, optional flag to validate the XML file if it contains a Doctype section Returns ------- boolean the DOM is valid or not according to the specified DTD True if there is no DTD See Also -------- `DOMCompanion.validate` Notes ----- if a DTD is specified, uses it to add default attributes and to collect IDs See https://docs.python.org/3/library/xml.dom.minidom.html """ self.doc = parseString(xml) self.documentElement = self.doc.documentElement if validate : if self.validate() : return True else : self._enrichXML() return False else : self._enrichXML() return True
def toLighter(self, del_spaces=True, del_comments=True, del_pi=True)
-
to suppress text nodes (with only separators), processing instructions and/or comments
Parameters
del_spaces
:boolean
, optional- to suppress blank nodes (with only newline, tabulation et space caracters)
del_comments
:boolean
, optional- to suppress comment nodes
del_spaces
:boolean
, optional- to suppress processing instruction nodes
Returns
DOMCompaniom
- itself
Expand source code
def toLighter(self, del_spaces = True, del_comments = True, del_pi = True) : """ to suppress text nodes (with only separators), processing instructions and/or comments Parameters ---------- del_spaces : boolean, optional to suppress blank nodes (with only newline, tabulation et space caracters) del_comments : boolean, optional to suppress comment nodes del_spaces : boolean, optional to suppress processing instruction nodes Returns ------- DOMCompaniom itself """ if self.doc is not None : self._purgeDOM(self.doc, del_spaces, del_comments, del_pi) return self
def toprettyxml(self, indent='\t', newl='\n', encoding=None, standalone=None)
-
produce pretty-printed version of the XML string
Returns
str
- the XML string
Notes
Expand source code
def toprettyxml(self,indent="\t", newl="\n", encoding=None, standalone=None) : """ produce pretty-printed version of the XML string Returns ------- str the XML string Notes ----- See https://docs.python.org/3/library/xml.dom.minidom.html """ return self.doc.toprettyxml(ident, newl, encoding, standalone)
def toxml(self)
-
produce XML string
Returns
str
- the XML string
Notes
Expand source code
def toxml(self) : """ produce XML string Returns ------- str the XML string Notes ----- See https://docs.python.org/3/library/xml.dom.minidom.html """ return self.doc.toxml()
def validate(self)
-
to validate the XML according its DTD (enrich it too). It uses lxml module to validate the XML document.
Returns
boolean
- the DOM is valid or not according to the specified DTD
Expand source code
def validate(self) : """ to validate the XML according its DTD (enrich it too). It uses lxml module to validate the XML document. Returns ------- boolean the DOM is valid or not according to the specified DTD """ if self.doc is not None : parser = etree.XMLParser(recover=True, strip_cdata=True) tree = etree.XML(self.doc.toxml(), parser) dtdFile = self._getDTDFile() if dtdFile is not None : if _existFile(dtdFile) : dtd = etree.DTD(dtdFile) if dtd.validate(tree) : self._enrichXML() return True else : print(dtd.error_log.filter_from_errors()[0]) return False else : print('Unable to find the DTD file ',dtdFile) return False else: self._enrichXML() return True else : return False