caselawclient.content_hash

The content hash is the SHA256 hash of the judgment text with all whitespace removed. This is intended to confirm that various processing has not changed the content of the judgment, whilst allowing variations in the XML which might not allow for preservation of whitespace.

The canonical version of this hashing function is in the parser: https://github.com/nationalarchives/tna-judgments-parser/blob/main/src/akn/SHA256.cs

 1"""
 2The content hash is the SHA256 hash of the judgment text with all whitespace removed.
 3This is intended to confirm that various processing has not changed the content of
 4the judgment, whilst allowing variations in the XML which might not allow for
 5preservation of whitespace.
 6
 7The canonical version of this hashing function is in the parser:
 8https://github.com/nationalarchives/tna-judgments-parser/blob/main/src/akn/SHA256.cs
 9"""
10
11import re
12from hashlib import sha256
13
14import lxml.etree
15
16from .errors import InvalidContentHashError
17
18
def get_hashable_text(doc: bytes) -> bytes:
    """Return the UTF-8 bytes of the document text that the content hash covers.

    The document is parsed as XML, every ``akn:meta`` element (Akoma Ntoso 3.0
    namespace) is detached from the tree, the remaining text nodes are joined
    in document order, and all whitespace characters are stripped out.

    :param doc: The serialised XML document.
    :return: The whitespace-free text, encoded as UTF-8.

    Errors raised by lxml while parsing ``doc`` propagate unchanged.
    """
    akn_namespaces = {"akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0"}
    tree = lxml.etree.fromstring(doc)

    # At most one <akn:meta> is expected, but zero (or several) are handled too.
    # NOTE(review): lxml's remove() discards the removed element's tail text as
    # well; presumably only whitespace ever follows </meta> -- confirm.
    for meta_element in tree.xpath("//akn:meta", namespaces=akn_namespaces):
        meta_element.getparent().remove(meta_element)

    joined_text = "".join(tree.itertext())
    return re.sub(r"\s", "", joined_text).encode("utf-8")
31
32
def get_hash_from_document(doc: bytes) -> str:
    """Compute the content hash of an XML document from its text.

    :param doc: The serialised XML document.
    :return: The hexadecimal SHA256 digest of ``get_hashable_text(doc)``.
    """
    hashable_bytes = get_hashable_text(doc)
    digest = sha256(hashable_bytes)
    return digest.hexdigest()
36
37
def get_hash_from_tag(doc: bytes) -> str:
    """Return the content hash recorded in the document's ``uk:hash`` element.

    The document is parsed as XML and the first text node found under any
    ``uk:hash`` element (namespace
    ``https://caselaw.nationalarchives.gov.uk/akn``) is returned as a plain
    ``str``, without any normalisation.

    :param doc: The serialised XML document.
    :return: The text of the first ``uk:hash`` text node.
    :raises InvalidContentHashError: If the query finds no such text node
        (the element is absent or has no text).
    """
    root = lxml.etree.fromstring(doc)
    hash_texts = root.xpath(
        "//uk:hash/text()",
        namespaces={"uk": "https://caselaw.nationalarchives.gov.uk/akn"},
    )
    # Check for an empty result directly instead of indexing and catching
    # IndexError: raising from inside the ``except`` block chained an
    # irrelevant IndexError onto the error that callers actually see.
    if not hash_texts:
        raise InvalidContentHashError("Document did not have a content hash tag")

    return str(hash_texts[0])
50
51
def validate_content_hash(doc: bytes) -> str:
    """Confirm that a document's recorded content hash matches its actual content.

    The hash is first computed from the document text and then read from the
    document's hash tag; the two values must be equal.

    :param doc: The serialised XML document.
    :return: The content hash, when both values agree.
    :raises InvalidContentHashError: If the two hashes differ, or if reading
        the hash from the tag fails.
    """
    hash_from_document = get_hash_from_document(doc)
    hash_from_tag = get_hash_from_tag(doc)

    if hash_from_document == hash_from_tag:
        return hash_from_document

    raise InvalidContentHashError(
        f'Hash of existing tag is "{hash_from_tag}" but the hash of the document is "{hash_from_document}"',
    )
def get_hashable_text(doc: bytes) -> bytes:
def get_hashable_text(doc: bytes) -> bytes:
    """Return the UTF-8 bytes of the document text that the content hash covers.

    The document is parsed as XML, every ``akn:meta`` element (Akoma Ntoso 3.0
    namespace) is detached from the tree, the remaining text nodes are joined
    in document order, and all whitespace characters are stripped out.

    :param doc: The serialised XML document.
    :return: The whitespace-free text, encoded as UTF-8.

    Errors raised by lxml while parsing ``doc`` propagate unchanged.
    """
    akn_namespaces = {"akn": "http://docs.oasis-open.org/legaldocml/ns/akn/3.0"}
    tree = lxml.etree.fromstring(doc)

    # At most one <akn:meta> is expected, but zero (or several) are handled too.
    # NOTE(review): lxml's remove() discards the removed element's tail text as
    # well; presumably only whitespace ever follows </meta> -- confirm.
    for meta_element in tree.xpath("//akn:meta", namespaces=akn_namespaces):
        meta_element.getparent().remove(meta_element)

    joined_text = "".join(tree.itertext())
    return re.sub(r"\s", "", joined_text).encode("utf-8")

Extract the text (as UTF-8 bytes) that would be hashed

def get_hash_from_document(doc: bytes) -> str:
def get_hash_from_document(doc: bytes) -> str:
    """Compute the content hash of an XML document from its text.

    :param doc: The serialised XML document.
    :return: The hexadecimal SHA256 digest of ``get_hashable_text(doc)``.
    """
    hashable_bytes = get_hashable_text(doc)
    digest = sha256(hashable_bytes)
    return digest.hexdigest()

Get the content hash of an XML document from its contents

def get_hash_from_tag(doc: bytes) -> str:
def get_hash_from_tag(doc: bytes) -> str:
    """Return the content hash recorded in the document's ``uk:hash`` element.

    The document is parsed as XML and the first text node found under any
    ``uk:hash`` element (namespace
    ``https://caselaw.nationalarchives.gov.uk/akn``) is returned as a plain
    ``str``, without any normalisation.

    :param doc: The serialised XML document.
    :return: The text of the first ``uk:hash`` text node.
    :raises InvalidContentHashError: If the query finds no such text node
        (the element is absent or has no text).
    """
    root = lxml.etree.fromstring(doc)
    hash_texts = root.xpath(
        "//uk:hash/text()",
        namespaces={"uk": "https://caselaw.nationalarchives.gov.uk/akn"},
    )
    # Check for an empty result directly instead of indexing and catching
    # IndexError: raising from inside the ``except`` block chained an
    # irrelevant IndexError onto the error that callers actually see.
    if not hash_texts:
        raise InvalidContentHashError("Document did not have a content hash tag")

    return str(hash_texts[0])

Get the content hash of an XML document from its uk:hash tag (if present).

def validate_content_hash(doc: bytes) -> str:
def validate_content_hash(doc: bytes) -> str:
    """Confirm that a document's recorded content hash matches its actual content.

    The hash is first computed from the document text and then read from the
    document's hash tag; the two values must be equal.

    :param doc: The serialised XML document.
    :return: The content hash, when both values agree.
    :raises InvalidContentHashError: If the two hashes differ, or if reading
        the hash from the tag fails.
    """
    hash_from_document = get_hash_from_document(doc)
    hash_from_tag = get_hash_from_tag(doc)

    if hash_from_document == hash_from_tag:
        return hash_from_document

    raise InvalidContentHashError(
        f'Hash of existing tag is "{hash_from_tag}" but the hash of the document is "{hash_from_document}"',
    )

Check a document's self-described content hash is the same as the hash of its content, raise an error if not