caselawclient.content_hash
The content hash is the SHA256 hash of the judgment text with all whitespace removed. This is intended to confirm that various processing has not changed the content of the judgment, whilst allowing variations in the XML which might not allow for preservation of whitespace.
The canonical version of this hashing function is in the parser: https://github.com/nationalarchives/tna-judgments-parser/blob/main/src/akn/SHA256.cs
"""
The content hash is the SHA256 hash of the judgment text with all whitespace removed.
This is intended to confirm that various processing has not changed the content of
the judgment, whilst allowing variations in the XML which might not allow for
preservation of whitespace.

The canonical version of this hashing function is in the parser:
https://github.com/nationalarchives/tna-judgments-parser/blob/main/src/akn/SHA256.cs
"""

import re
from hashlib import sha256

import lxml.etree

from .errors import InvalidContentHashError


def get_hashable_text(doc: bytes) -> bytes:
    """Return the whitespace-free document text, UTF-8 encoded, that the content hash is computed over.

    Every ``akn:meta`` element is removed from the parsed tree before the text is
    collected, so metadata does not contribute to the result.
    """
    akn_namespaces = {"akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0"}
    tree = lxml.etree.fromstring(doc)
    # At most one meta element is expected; a document without one is handled too.
    for meta_element in tree.xpath("//akn:meta", namespaces=akn_namespaces):
        meta_element.getparent().remove(meta_element)
    joined_text = "".join(tree.itertext())
    return re.sub(r"\s", "", joined_text).encode("utf-8")


def get_hash_from_document(doc: bytes) -> str:
    """Compute the SHA256 hex digest of the hashable text of an XML document."""
    hashable = get_hashable_text(doc)
    digest = sha256(hashable)
    return digest.hexdigest()


def get_hash_from_tag(doc: bytes) -> str:
    """Get the content hash recorded in an XML document's uk:hash tag.

    Only the first text node matched by ``//uk:hash/text()`` is used.

    Raises:
        InvalidContentHashError: if the XPath ``//uk:hash/text()`` matches nothing.
    """
    root = lxml.etree.fromstring(doc)
    hash_texts = root.xpath(
        "//uk:hash/text()",
        namespaces={"uk": "https://caselaw.nationalarchives.gov.uk/akn"},
    )
    # Test for the missing tag directly rather than catching IndexError, so the
    # error is not reported as having occurred while handling another exception.
    if not hash_texts:
        raise InvalidContentHashError("Document did not have a content hash tag")

    return str(hash_texts[0])


def validate_content_hash(doc: bytes) -> str:
    """Check that a document's uk:hash tag matches the hash computed from its content.

    Returns the computed hash when the two agree.

    Raises:
        InvalidContentHashError: if the two hashes differ.
    """
    computed = get_hash_from_document(doc)
    declared = get_hash_from_tag(doc)
    if computed == declared:
        return computed
    raise InvalidContentHashError(
        f'Hash of existing tag is "{declared}" but the hash of the document is "{computed}"',
    )
def
get_hashable_text(doc: bytes) -> bytes:
def get_hashable_text(doc: bytes) -> bytes:
    """Return the whitespace-free document text, UTF-8 encoded, that the content hash is computed over.

    Every ``akn:meta`` element is removed from the parsed tree before the text is
    collected, so metadata does not contribute to the result.
    """
    akn_namespaces = {"akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0"}
    tree = lxml.etree.fromstring(doc)
    # At most one meta element is expected; a document without one is handled too.
    for meta_element in tree.xpath("//akn:meta", namespaces=akn_namespaces):
        meta_element.getparent().remove(meta_element)
    joined_text = "".join(tree.itertext())
    return re.sub(r"\s", "", joined_text).encode("utf-8")
Extract the text (as UTF-8 bytes) that would be hashed
def
get_hash_from_document(doc: bytes) -> str:
def get_hash_from_document(doc: bytes) -> str:
    """Compute the SHA256 hex digest of the hashable text of an XML document."""
    hashable = get_hashable_text(doc)
    digest = sha256(hashable)
    return digest.hexdigest()
Get the content hash of an XML document from its contents
def
get_hash_from_tag(doc: bytes) -> str:
def get_hash_from_tag(doc: bytes) -> str:
    """Get the content hash recorded in an XML document's uk:hash tag.

    Only the first text node matched by ``//uk:hash/text()`` is used.

    Raises:
        InvalidContentHashError: if the XPath ``//uk:hash/text()`` matches nothing.
    """
    root = lxml.etree.fromstring(doc)
    hash_texts = root.xpath(
        "//uk:hash/text()",
        namespaces={"uk": "https://caselaw.nationalarchives.gov.uk/akn"},
    )
    # Test for the missing tag directly rather than catching IndexError, so the
    # error is not reported as having occurred while handling another exception.
    if not hash_texts:
        raise InvalidContentHashError("Document did not have a content hash tag")

    return str(hash_texts[0])
Get the content hash of an XML document from its uk:hash tag (if present).
def
validate_content_hash(doc: bytes) -> str:
def validate_content_hash(doc: bytes) -> str:
    """Check that a document's uk:hash tag matches the hash computed from its content.

    Returns the computed hash when the two agree.

    Raises:
        InvalidContentHashError: if the two hashes differ.
    """
    computed = get_hash_from_document(doc)
    declared = get_hash_from_tag(doc)
    if computed == declared:
        return computed
    raise InvalidContentHashError(
        f'Hash of existing tag is "{declared}" but the hash of the document is "{computed}"',
    )
Check that a document's self-described content hash is the same as the hash of its content; raise an error if not.