caselawclient.models.documents.body
import datetime
import os
import warnings
from functools import cache, cached_property
from typing import Optional

import pytz
from ds_caselaw_utils.types import CourtCode
from lxml import etree
from saxonche import PySaxonProcessor

from caselawclient.models.utilities.dates import parse_string_date_as_utc
from caselawclient.types import DocumentCategory
from caselawclient.xml_helpers import DEFAULT_NAMESPACES

from .xml import XML


class UnparsableDate(Warning):
    """Warning category issued when a document date string cannot be parsed."""

    pass


class DocumentBody:
    """
    A class for abstracting out interactions with the body of a document.
    """

    def __init__(self, xml_bytestring: bytes):
        self._xml = XML(xml_bytestring=xml_bytestring)
        """ This is an instance of the `Document.XML` class for manipulation of the XML document itself. """

    def get_xpath_match_string(self, xpath: str, namespaces: dict[str, str] = DEFAULT_NAMESPACES) -> str:
        """Delegate to the wrapped XML object's `get_xpath_match_string`."""
        return self._xml.get_xpath_match_string(xpath, namespaces)

    def get_xpath_match_strings(self, xpath: str, namespaces: dict[str, str] = DEFAULT_NAMESPACES) -> list[str]:
        """Delegate to the wrapped XML object's `get_xpath_match_strings`."""
        return self._xml.get_xpath_match_strings(xpath, namespaces)

    def get_xpath_nodes(self, xpath: str, namespaces: dict[str, str] = DEFAULT_NAMESPACES) -> list[etree._Element]:
        """Delegate to the wrapped XML object's `get_xpath_nodes`."""
        return self._xml.get_xpath_nodes(xpath, namespaces)

    @cached_property
    def name(self) -> str:
        """The `@value` of the `FRBRWork/FRBRname` element in the document metadata."""
        return self.get_xpath_match_string(
            "/akn:akomaNtoso/akn:*/akn:meta/akn:identification/akn:FRBRWork/akn:FRBRname/@value"
        )

    @cached_property
    def court(self) -> str:
        """The text of the proprietary `uk:court` metadata element."""
        return self.get_xpath_match_string("/akn:akomaNtoso/akn:*/akn:meta/akn:proprietary/uk:court/text()")

    @cached_property
    def jurisdiction(self) -> str:
        """The text of the proprietary `uk:jurisdiction` metadata element."""
        return self.get_xpath_match_string("/akn:akomaNtoso/akn:*/akn:meta/akn:proprietary/uk:jurisdiction/text()")

    @cached_property
    def categories(self) -> list[DocumentCategory]:
        """
        Build the category tree from the `uk:category` metadata elements.

        :return: The categories which have no `parent` attribute, each with any
            categories naming it as their parent attached as subcategories.
        """
        xpath = "/akn:akomaNtoso/akn:*/akn:meta/akn:proprietary/uk:category"
        nodes = self.get_xpath_nodes(xpath, DEFAULT_NAMESPACES)

        categories: dict[str, DocumentCategory] = {}
        children_map: dict[str, list[DocumentCategory]] = {}

        for node in nodes:
            name = node.text
            # Elements with no text, or only whitespace, are ignored entirely.
            if name is None or not name.strip():
                continue

            category = DocumentCategory(name=name)
            categories[name] = category

            parent = node.get("parent")

            if parent:
                children_map.setdefault(parent, []).append(category)

        # Children whose named parent is not itself present are dropped.
        for parent, subcategories in children_map.items():
            if parent in categories:
                categories[parent].subcategories.extend(subcategories)

        top_level_categories = [
            categories[name]
            for node in nodes
            if node.get("parent") is None
            if (name := node.text) and name in categories
        ]

        return top_level_categories

    # NOTE: Deprecated - use categories function
    @cached_property
    def category(self) -> Optional[str]:
        """The text of the first `uk:category` element which has no `parent` attribute."""
        return self.get_xpath_match_string(
            "/akn:akomaNtoso/akn:*/akn:meta/akn:proprietary/uk:category[not(@parent)][1]/text()"
        )

    @cached_property
    def case_number(self) -> Optional[str]:
        """The text of the proprietary `uk:caseNumber` metadata element."""
        return self.get_xpath_match_string("/akn:akomaNtoso/akn:*/akn:meta/akn:proprietary/uk:caseNumber/text()")

    @property
    def court_and_jurisdiction_identifier_string(self) -> CourtCode:
        """The court code, with `/<jurisdiction>` appended when the jurisdiction is not an empty string."""
        if self.jurisdiction != "":
            return CourtCode("/".join((self.court, self.jurisdiction)))
        return CourtCode(self.court)

    @cached_property
    def document_date_as_string(self) -> str:
        """The `@date` of the `FRBRWork/FRBRdate` element, as it appears in the XML."""
        return self.get_xpath_match_string(
            "/akn:akomaNtoso/akn:*/akn:meta/akn:identification/akn:FRBRWork/akn:FRBRdate/@date",
        )

    @cached_property
    def document_date_as_date(self) -> Optional[datetime.date]:
        """
        The document date parsed as a `datetime.date`.

        :return: The parsed date, or `None` if the date string is empty or is not
            in `%Y-%m-%d` format (in which case an `UnparsableDate` warning is issued).
        """
        if not self.document_date_as_string:
            return None
        try:
            return datetime.datetime.strptime(
                self.document_date_as_string,
                "%Y-%m-%d",
            ).date()
        except ValueError:
            warnings.warn(
                f"Unparsable date encountered: {self.document_date_as_string}",
                UnparsableDate,
            )
            return None

    def get_manifestation_datetimes(
        self,
        name: Optional[str] = None,
    ) -> list[datetime.datetime]:
        """
        Collect the `FRBRManifestation/FRBRdate` `@date` values from the metadata.

        :param name: If given, only `FRBRdate` elements whose `@name` equals this value are used.

        :return: Each matching date string passed through `parse_string_date_as_utc`.
        """
        name_filter = f"[@name='{name}']" if name else ""
        iso_datetimes = self.get_xpath_match_strings(
            f"/akn:akomaNtoso/akn:*/akn:meta/akn:identification/akn:FRBRManifestation/akn:FRBRdate{name_filter}/@date",
        )

        return [parse_string_date_as_utc(event, pytz.UTC) for event in iso_datetimes]

    def get_latest_manifestation_datetime(
        self,
        name: Optional[str] = None,
    ) -> Optional[datetime.datetime]:
        """
        The most recent manifestation datetime, optionally restricted to one `@name`.

        :return: The latest datetime, or `None` if there are no matching dates.
        """
        events = self.get_manifestation_datetimes(name)
        if not events:
            return None
        return max(events)

    def get_latest_manifestation_type(self) -> Optional[str]:
        """
        Which of `transform` or `tna-enriched` has the most recent manifestation datetime.

        :return: The name of the most recent manifestation type, or `None` if the
            document has no manifestation date of either type.
        """
        latest = max(
            (
                (manifestation_type, time)
                for manifestation_type in ["transform", "tna-enriched"]
                if (time := self.get_latest_manifestation_datetime(manifestation_type))
            ),
            key=lambda pair: pair[1],
            # Without a default, max() raises ValueError when neither type has a date.
            default=None,
        )
        if latest is None:
            return None
        return latest[0]

    @cached_property
    def transformation_datetime(self) -> Optional[datetime.datetime]:
        """When was this document successfully parsed or reparsed (date from XML)"""
        return self.get_latest_manifestation_datetime("transform")

    @cached_property
    def enrichment_datetime(self) -> Optional[datetime.datetime]:
        """When was this document successfully enriched (date from XML)"""
        return self.get_latest_manifestation_datetime("tna-enriched")

    @cached_property
    def content_as_xml(self) -> str:
        """The document XML as a string, as provided by the wrapped XML object."""
        return self._xml.xml_as_string

    @cached_property
    def has_content(self) -> bool:
        """If we do not have a word document, the XML will not contain
        the contents of the judgment, but will have content in the header if a judgment.
        All press summaries (which have <doc> not <judgment> tags) are assumed to have content"""
        return bool(
            self._xml.xml_as_tree.xpath("//akn:header[normalize-space(string(.))]", namespaces=DEFAULT_NAMESPACES)
            or self._xml.xml_as_tree.xpath("//akn:doc", namespaces=DEFAULT_NAMESPACES)
        )

    @cached_property
    def has_external_data(self) -> bool:
        """Is there data which is not present within the source document:
        is there a spreadsheet which has populated some fields. The current implementation
        "is there a uk:party tag" is intended as a stopgap whilst we're not importing that data."""
        return bool(self._xml.xml_as_tree.xpath("//uk:party", namespaces=DEFAULT_NAMESPACES))

    # NOTE(review): `functools.cache` on an instance method keys on `self` and keeps
    # each instance alive for the lifetime of the cache (ruff B019).
    @cache
    def content_html(self, image_prefix: str) -> Optional[str]:
        """Convert the XML representation of the Document into HTML for rendering."""
        # This used to be called content_as_html but we have changed the parameter passed to it from the
        # domain of the assets to the path in which the assets are stored (from assets to assets/d-a1b2c3)
        # and made the image_prefix mandatory
        if not self.has_content:
            return None

        html_xslt_location = os.path.join(os.path.dirname(os.path.realpath(__file__)), "transforms", "html.xsl")

        with PySaxonProcessor() as proc:
            xslt_processor = proc.new_xslt30_processor()
            document = proc.parse_xml(xml_text=self._xml.xml_as_string)

            executable = xslt_processor.compile_stylesheet(stylesheet_file=html_xslt_location)

            # An empty prefix leaves the stylesheet parameter unset.
            if image_prefix:
                executable.set_parameter("image-prefix", proc.make_string_value(image_prefix))

            return str(executable.transform_to_string(xdm_node=document))

    @cached_property
    def failed_to_parse(self) -> bool:
        """
        Did this document entirely fail to parse?

        :return: `True` if there was a complete parser failure, otherwise `False`
        """
        return "error" in self._xml.root_element

    def apply_xslt(self, xslt_filename: str, **values: str) -> bytes:
        """Delegate to the wrapped XML object's `apply_xslt`, forwarding `values` as keyword arguments."""
        return self._xml.apply_xslt(xslt_filename, **values)
Base class for warning categories.
class DocumentBody:
    """
    A class for abstracting out interactions with the body of a document.
    """

    def __init__(self, xml_bytestring: bytes):
        self._xml = XML(xml_bytestring=xml_bytestring)
        """ This is an instance of the `Document.XML` class for manipulation of the XML document itself. """

    def get_xpath_match_string(self, xpath: str, namespaces: dict[str, str] = DEFAULT_NAMESPACES) -> str:
        """Delegate to the wrapped XML object's `get_xpath_match_string`."""
        return self._xml.get_xpath_match_string(xpath, namespaces)

    def get_xpath_match_strings(self, xpath: str, namespaces: dict[str, str] = DEFAULT_NAMESPACES) -> list[str]:
        """Delegate to the wrapped XML object's `get_xpath_match_strings`."""
        return self._xml.get_xpath_match_strings(xpath, namespaces)

    def get_xpath_nodes(self, xpath: str, namespaces: dict[str, str] = DEFAULT_NAMESPACES) -> list[etree._Element]:
        """Delegate to the wrapped XML object's `get_xpath_nodes`."""
        return self._xml.get_xpath_nodes(xpath, namespaces)

    @cached_property
    def name(self) -> str:
        """The `@value` of the `FRBRWork/FRBRname` element in the document metadata."""
        return self.get_xpath_match_string(
            "/akn:akomaNtoso/akn:*/akn:meta/akn:identification/akn:FRBRWork/akn:FRBRname/@value"
        )

    @cached_property
    def court(self) -> str:
        """The text of the proprietary `uk:court` metadata element."""
        return self.get_xpath_match_string("/akn:akomaNtoso/akn:*/akn:meta/akn:proprietary/uk:court/text()")

    @cached_property
    def jurisdiction(self) -> str:
        """The text of the proprietary `uk:jurisdiction` metadata element."""
        return self.get_xpath_match_string("/akn:akomaNtoso/akn:*/akn:meta/akn:proprietary/uk:jurisdiction/text()")

    @cached_property
    def categories(self) -> list[DocumentCategory]:
        """
        Build the category tree from the `uk:category` metadata elements.

        :return: The categories which have no `parent` attribute, each with any
            categories naming it as their parent attached as subcategories.
        """
        xpath = "/akn:akomaNtoso/akn:*/akn:meta/akn:proprietary/uk:category"
        nodes = self.get_xpath_nodes(xpath, DEFAULT_NAMESPACES)

        categories: dict[str, DocumentCategory] = {}
        children_map: dict[str, list[DocumentCategory]] = {}

        for node in nodes:
            name = node.text
            # Elements with no text, or only whitespace, are ignored entirely.
            if name is None or not name.strip():
                continue

            category = DocumentCategory(name=name)
            categories[name] = category

            parent = node.get("parent")

            if parent:
                children_map.setdefault(parent, []).append(category)

        # Children whose named parent is not itself present are dropped.
        for parent, subcategories in children_map.items():
            if parent in categories:
                categories[parent].subcategories.extend(subcategories)

        top_level_categories = [
            categories[name]
            for node in nodes
            if node.get("parent") is None
            if (name := node.text) and name in categories
        ]

        return top_level_categories

    # NOTE: Deprecated - use categories function
    @cached_property
    def category(self) -> Optional[str]:
        """The text of the first `uk:category` element which has no `parent` attribute."""
        return self.get_xpath_match_string(
            "/akn:akomaNtoso/akn:*/akn:meta/akn:proprietary/uk:category[not(@parent)][1]/text()"
        )

    @cached_property
    def case_number(self) -> Optional[str]:
        """The text of the proprietary `uk:caseNumber` metadata element."""
        return self.get_xpath_match_string("/akn:akomaNtoso/akn:*/akn:meta/akn:proprietary/uk:caseNumber/text()")

    @property
    def court_and_jurisdiction_identifier_string(self) -> CourtCode:
        """The court code, with `/<jurisdiction>` appended when the jurisdiction is not an empty string."""
        if self.jurisdiction != "":
            return CourtCode("/".join((self.court, self.jurisdiction)))
        return CourtCode(self.court)

    @cached_property
    def document_date_as_string(self) -> str:
        """The `@date` of the `FRBRWork/FRBRdate` element, as it appears in the XML."""
        return self.get_xpath_match_string(
            "/akn:akomaNtoso/akn:*/akn:meta/akn:identification/akn:FRBRWork/akn:FRBRdate/@date",
        )

    @cached_property
    def document_date_as_date(self) -> Optional[datetime.date]:
        """
        The document date parsed as a `datetime.date`.

        :return: The parsed date, or `None` if the date string is empty or is not
            in `%Y-%m-%d` format (in which case an `UnparsableDate` warning is issued).
        """
        if not self.document_date_as_string:
            return None
        try:
            return datetime.datetime.strptime(
                self.document_date_as_string,
                "%Y-%m-%d",
            ).date()
        except ValueError:
            warnings.warn(
                f"Unparsable date encountered: {self.document_date_as_string}",
                UnparsableDate,
            )
            return None

    def get_manifestation_datetimes(
        self,
        name: Optional[str] = None,
    ) -> list[datetime.datetime]:
        """
        Collect the `FRBRManifestation/FRBRdate` `@date` values from the metadata.

        :param name: If given, only `FRBRdate` elements whose `@name` equals this value are used.

        :return: Each matching date string passed through `parse_string_date_as_utc`.
        """
        name_filter = f"[@name='{name}']" if name else ""
        iso_datetimes = self.get_xpath_match_strings(
            f"/akn:akomaNtoso/akn:*/akn:meta/akn:identification/akn:FRBRManifestation/akn:FRBRdate{name_filter}/@date",
        )

        return [parse_string_date_as_utc(event, pytz.UTC) for event in iso_datetimes]

    def get_latest_manifestation_datetime(
        self,
        name: Optional[str] = None,
    ) -> Optional[datetime.datetime]:
        """
        The most recent manifestation datetime, optionally restricted to one `@name`.

        :return: The latest datetime, or `None` if there are no matching dates.
        """
        events = self.get_manifestation_datetimes(name)
        if not events:
            return None
        return max(events)

    def get_latest_manifestation_type(self) -> Optional[str]:
        """
        Which of `transform` or `tna-enriched` has the most recent manifestation datetime.

        :return: The name of the most recent manifestation type, or `None` if the
            document has no manifestation date of either type.
        """
        latest = max(
            (
                (manifestation_type, time)
                for manifestation_type in ["transform", "tna-enriched"]
                if (time := self.get_latest_manifestation_datetime(manifestation_type))
            ),
            key=lambda pair: pair[1],
            # Without a default, max() raises ValueError when neither type has a date.
            default=None,
        )
        if latest is None:
            return None
        return latest[0]

    @cached_property
    def transformation_datetime(self) -> Optional[datetime.datetime]:
        """When was this document successfully parsed or reparsed (date from XML)"""
        return self.get_latest_manifestation_datetime("transform")

    @cached_property
    def enrichment_datetime(self) -> Optional[datetime.datetime]:
        """When was this document successfully enriched (date from XML)"""
        return self.get_latest_manifestation_datetime("tna-enriched")

    @cached_property
    def content_as_xml(self) -> str:
        """The document XML as a string, as provided by the wrapped XML object."""
        return self._xml.xml_as_string

    @cached_property
    def has_content(self) -> bool:
        """If we do not have a word document, the XML will not contain
        the contents of the judgment, but will have content in the header if a judgment.
        All press summaries (which have <doc> not <judgment> tags) are assumed to have content"""
        return bool(
            self._xml.xml_as_tree.xpath("//akn:header[normalize-space(string(.))]", namespaces=DEFAULT_NAMESPACES)
            or self._xml.xml_as_tree.xpath("//akn:doc", namespaces=DEFAULT_NAMESPACES)
        )

    @cached_property
    def has_external_data(self) -> bool:
        """Is there data which is not present within the source document:
        is there a spreadsheet which has populated some fields. The current implementation
        "is there a uk:party tag" is intended as a stopgap whilst we're not importing that data."""
        return bool(self._xml.xml_as_tree.xpath("//uk:party", namespaces=DEFAULT_NAMESPACES))

    # NOTE(review): `functools.cache` on an instance method keys on `self` and keeps
    # each instance alive for the lifetime of the cache (ruff B019).
    @cache
    def content_html(self, image_prefix: str) -> Optional[str]:
        """Convert the XML representation of the Document into HTML for rendering."""
        # This used to be called content_as_html but we have changed the parameter passed to it from the
        # domain of the assets to the path in which the assets are stored (from assets to assets/d-a1b2c3)
        # and made the image_prefix mandatory
        if not self.has_content:
            return None

        html_xslt_location = os.path.join(os.path.dirname(os.path.realpath(__file__)), "transforms", "html.xsl")

        with PySaxonProcessor() as proc:
            xslt_processor = proc.new_xslt30_processor()
            document = proc.parse_xml(xml_text=self._xml.xml_as_string)

            executable = xslt_processor.compile_stylesheet(stylesheet_file=html_xslt_location)

            # An empty prefix leaves the stylesheet parameter unset.
            if image_prefix:
                executable.set_parameter("image-prefix", proc.make_string_value(image_prefix))

            return str(executable.transform_to_string(xdm_node=document))

    @cached_property
    def failed_to_parse(self) -> bool:
        """
        Did this document entirely fail to parse?

        :return: `True` if there was a complete parser failure, otherwise `False`
        """
        return "error" in self._xml.root_element

    def apply_xslt(self, xslt_filename: str, **values: str) -> bytes:
        """Delegate to the wrapped XML object's `apply_xslt`, forwarding `values` as keyword arguments."""
        return self._xml.apply_xslt(xslt_filename, **values)
A class for abstracting out interactions with the body of a document.
@cached_property
def categories(self) -> list[DocumentCategory]:
    """
    Assemble the document's category tree from its `uk:category` metadata elements.

    :return: The categories whose element has no `parent` attribute, in document
        order, each carrying the categories that name it as their parent.
    """
    category_nodes = self.get_xpath_nodes(
        "/akn:akomaNtoso/akn:*/akn:meta/akn:proprietary/uk:category",
        DEFAULT_NAMESPACES,
    )

    by_name: dict[str, DocumentCategory] = {}
    children_of: dict[str, list[DocumentCategory]] = {}

    # First pass: one DocumentCategory per non-blank element, grouped by parent name.
    for element in category_nodes:
        label = element.text
        if label is None or not label.strip():
            continue

        entry = DocumentCategory(name=label)
        by_name[label] = entry

        parent_name = element.get("parent")
        if parent_name:
            children_of.setdefault(parent_name, []).append(entry)

    # Second pass: attach children to parents that exist; orphans are left out.
    for parent_name, children in children_of.items():
        if parent_name in by_name:
            by_name[parent_name].subcategories.extend(children)

    # Final pass: pick out the elements which declare no parent at all.
    roots: list[DocumentCategory] = []
    for element in category_nodes:
        if element.get("parent") is not None:
            continue
        label = element.text
        if label and label in by_name:
            roots.append(by_name[label])

    return roots
@cached_property
def document_date_as_date(self) -> Optional[datetime.date]:
    """
    The document date string converted to a `datetime.date`.

    :return: The parsed date; `None` when the date string is empty, or when it
        does not match `%Y-%m-%d` (an `UnparsableDate` warning is issued in that case).
    """
    raw_date = self.document_date_as_string
    if not raw_date:
        return None

    try:
        parsed = datetime.datetime.strptime(raw_date, "%Y-%m-%d")
    except ValueError:
        # Warn and return None rather than letting the ValueError propagate.
        warnings.warn(
            f"Unparsable date encountered: {raw_date}",
            UnparsableDate,
        )
        return None

    return parsed.date()
def get_manifestation_datetimes(
    self,
    name: Optional[str] = None,
) -> list[datetime.datetime]:
    """
    Read the `FRBRManifestation/FRBRdate` `@date` values out of the metadata.

    :param name: When given (and non-empty), restrict to `FRBRdate` elements whose
        `@name` attribute equals this value.

    :return: One entry per matching date string, each converted with `parse_string_date_as_utc`.
    """
    predicate = ""
    if name:
        predicate = f"[@name='{name}']"

    xpath = f"/akn:akomaNtoso/akn:*/akn:meta/akn:identification/akn:FRBRManifestation/akn:FRBRdate{predicate}/@date"

    parsed_datetimes = []
    for raw_datetime in self.get_xpath_match_strings(xpath):
        parsed_datetimes.append(parse_string_date_as_utc(raw_datetime, pytz.UTC))
    return parsed_datetimes
@cached_property
def transformation_datetime(self) -> Optional[datetime.datetime]:
    """
    When was this document successfully parsed or reparsed (date from XML).

    :return: The latest manifestation datetime named `transform`, as returned by
        `get_latest_manifestation_datetime`.
    """
    latest_transform = self.get_latest_manifestation_datetime("transform")
    return latest_transform
When was this document successfully parsed or reparsed (date from XML)
@cached_property
def enrichment_datetime(self) -> Optional[datetime.datetime]:
    """
    When was this document successfully enriched (date from XML).

    :return: The latest manifestation datetime named `tna-enriched`, as returned by
        `get_latest_manifestation_datetime`.
    """
    latest_enrichment = self.get_latest_manifestation_datetime("tna-enriched")
    return latest_enrichment
When was this document successfully enriched (date from XML)
@cached_property
def has_content(self) -> bool:
    """If we do not have a word document, the XML will not contain
    the contents of the judgment, but will have content in the header if a judgment.
    All press summaries (which have <doc> not <judgment> tags) are assumed to have content"""
    # A header containing any non-whitespace text counts as content.
    if self._xml.xml_as_tree.xpath("//akn:header[normalize-space(string(.))]", namespaces=DEFAULT_NAMESPACES):
        return True

    # Otherwise the presence of any <doc> element is enough.
    doc_elements = self._xml.xml_as_tree.xpath("//akn:doc", namespaces=DEFAULT_NAMESPACES)
    return bool(doc_elements)
If we do not have a word document, the XML will not contain
the contents of the judgment, but will have content in the header if a judgment.
All press summaries (which have <doc> not <judgment> tags) are assumed to have content
@cached_property
def has_external_data(self) -> bool:
    """Is there data which is not present within the source document:
    is there a spreadsheet which has populated some fields. The current implementation
    "is there a uk:party tag" is intended as a stopgap whilst we're not importing that data."""
    party_matches = self._xml.xml_as_tree.xpath("//uk:party", namespaces=DEFAULT_NAMESPACES)
    return bool(party_matches)
Is there data which is not present within the source document: is there a spreadsheet which has populated some fields. The current implementation "is there a uk:party tag" is intended as a stopgap whilst we're not importing that data.
# NOTE(review): `functools.cache` on an instance method keys on `self` and keeps
# each instance alive for the lifetime of the cache (ruff B019).
@cache
def content_html(self, image_prefix: str) -> Optional[str]:
    """Convert the XML representation of the Document into HTML for rendering.

    This used to be called content_as_html but we have changed the parameter passed to it from the
    domain of the assets to the path in which the assets are stored (from assets to assets/d-a1b2c3)
    and made the image_prefix mandatory.

    :param image_prefix: Passed to the stylesheet as its `image-prefix` parameter when non-empty.

    :return: The transformed document as a string, or `None` if the document has no content.
    """
    if not self.has_content:
        return None

    # The stylesheet lives in a `transforms` directory next to this module.
    module_directory = os.path.dirname(os.path.realpath(__file__))
    stylesheet_path = os.path.join(module_directory, "transforms", "html.xsl")

    with PySaxonProcessor() as saxon:
        xslt30 = saxon.new_xslt30_processor()
        source_node = saxon.parse_xml(xml_text=self._xml.xml_as_string)
        compiled = xslt30.compile_stylesheet(stylesheet_file=stylesheet_path)

        # An empty prefix leaves the stylesheet parameter unset.
        if image_prefix:
            prefix_value = saxon.make_string_value(image_prefix)
            compiled.set_parameter("image-prefix", prefix_value)

        rendered = compiled.transform_to_string(xdm_node=source_node)
        return str(rendered)
Convert the XML representation of the Document into HTML for rendering.
@cached_property
def failed_to_parse(self) -> bool:
    """
    Did this document entirely fail to parse?

    :return: `True` if there was a complete parser failure, otherwise `False`
    """
    # NOTE(review): this is a containment test for "error" against the root element
    # value exposed by the XML wrapper; its exact type is not visible here.
    root_element = self._xml.root_element
    return "error" in root_element
Did this document entirely fail to parse?
Returns
`True` if there was a complete parser failure, otherwise `False`