caselawclient.models.documents
import datetime
import os
import warnings
from functools import cached_property
from typing import TYPE_CHECKING, Any, Optional

from ds_caselaw_utils import courts
from ds_caselaw_utils.courts import CourtNotFoundException
from ds_caselaw_utils.types import NeutralCitationString
from pydantic import TypeAdapter
from requests_toolbelt.multipart import decoder

import caselawclient.models.documents.comparison as comparison
from caselawclient.errors import (
    DocumentNotFoundError,
    NotSupportedOnVersion,
    OnlySupportedOnVersion,
)
from caselawclient.identifier_resolution import IdentifierResolutions
from caselawclient.models.documents.versions import AnnotationDataDict
from caselawclient.models.identifiers import Identifier
from caselawclient.models.identifiers.exceptions import IdentifierValidationException
from caselawclient.models.identifiers.fclid import FindCaseLawIdentifier, FindCaseLawIdentifierSchema
from caselawclient.models.identifiers.unpacker import unpack_all_identifiers_from_etree
from caselawclient.models.utilities import VersionsDict, extract_version, render_versions
from caselawclient.models.utilities.aws import (
    ParserInstructionsDict,
    announce_document_event,
    are_unpublished_assets_clean,
    check_docx_exists,
    delete_documents_from_private_bucket,
    generate_docx_url,
    generate_pdf_url,
    publish_documents,
    request_parse,
    unpublish_documents,
)
from caselawclient.types import DocumentURIString, SuccessFailureMessageTuple

from .body import DocumentBody
from .exceptions import CannotEnrichUnenrichableDocument, CannotPublishUnpublishableDocument, DocumentNotSafeForDeletion
from .statuses import DOCUMENT_STATUS_HOLD, DOCUMENT_STATUS_IN_PROGRESS, DOCUMENT_STATUS_NEW, DOCUMENT_STATUS_PUBLISHED

# Documents enriched within this window count as "recently enriched": enrich()
# declines to re-send them unless called with even_if_recent=True.
MINIMUM_ENRICHMENT_TIME = datetime.timedelta(minutes=20)


class GatewayTimeoutGettingHTMLWithQuery(RuntimeWarning):
    # NOTE(review): not referenced anywhere in this module; the name suggests it
    # warns when fetching HTML with a search query hits a gateway timeout --
    # presumably emitted by callers elsewhere; confirm before removing.
    pass


DOCUMENT_COLLECTION_URI_JUDGMENT = "judgment"
DOCUMENT_COLLECTION_URI_PRESS_SUMMARY = "press-summary"

if TYPE_CHECKING:
    from caselawclient.Client import MarklogicApiClient


class Document:
    """
    A base class from which all other document types are extensions. This class includes the essential methods for
    retrieving and manipulating a document within MarkLogic.
    """

    document_noun = "document"
    """ The noun for a single instance of this document type. """

    document_noun_plural = "documents"
    """ The noun for a plural of this document type. """

    _default_reparse_document_type: Optional[str] = None
    """ The default noun to pass to the parser when reparsing given the document type if known. This is used to determine how the document should be parsed and processed."""

    type_collection_name: str

    attributes_to_validate: list[tuple[str, bool, str]] = [
        (
            "is_failure",
            False,
            "This document failed to parse",
        ),
        (
            "is_parked",
            False,
            "This {document_noun} is currently parked at a temporary URI",
        ),
        (
            "is_held",
            False,
            "This {document_noun} is currently on hold",
        ),
        (
            "has_name",
            True,
            "This {document_noun} has no name",
        ),
        (
            "has_valid_court",
            True,
            "The court for this {document_noun} is not valid",
        ),
        (
            "has_unique_content_hash",
            True,
            "There is another document with identical content",
        ),
        (
            "has_only_clean_assets",
            True,
            "An uncleaned asset exists for this document",
        ),
    ]
    """
    A list of tuples in the form:

    ``` python
    (
        attribute_name,
        passing_condition,
        error_message,
    )
    ```

    describing attributes which should be checked in order for a document to be considered valid.

    Individual document classes should extend this list where necessary to validate document type-specific attributes.
    """

    def __init__(self, uri: DocumentURIString, api_client: "MarklogicApiClient", search_query: Optional[str] = None):
        """
        :param uri: The URI of the document to retrieve from MarkLogic.
        :param api_client: An instance of the API client object to handle communication with the MarkLogic server.
        :param search_query: Optionally, a search string which should be highlighted if it appears in the document body.

        :raises DocumentNotFoundError: The document does not exist within MarkLogic
        """
        self.uri: DocumentURIString = uri
        self.api_client: MarklogicApiClient = api_client
        # Fail fast before fetching the XML body if the URI isn't in MarkLogic.
        if not self.document_exists():
            raise DocumentNotFoundError(f"Document {self.uri} does not exist")

        self.body: DocumentBody = DocumentBody(
            xml_bytestring=self.api_client.get_judgment_xml_bytestring(
                self.uri,
                show_unpublished=True,
                search_query=search_query,
            ),
        )
        """ `Document.body` represents the body of the document itself, without any information such as version tracking or properties. """

        self._initialise_identifiers()

    def __repr__(self) -> str:
        name = self.body.name or "un-named"
        return f"<{self.document_noun} {self.uri}: {name}>"

    def document_exists(self) -> bool:
        """Helper method to verify the existence of a document within MarkLogic.

        :return: `True` if the document exists, `False` otherwise."""
        return self.api_client.document_exists(self.uri)

    def docx_exists(self) -> bool:
        """There is a docx in S3 private bucket for this Document"""
        return check_docx_exists(self.uri)

    def _initialise_identifiers(self) -> None:
        """Load this document's identifiers from MarkLogic."""

        identifiers_element_as_etree = self.api_client.get_property_as_node(self.uri, "identifiers")
        self.identifiers = unpack_all_identifiers_from_etree(identifiers_element_as_etree)

    @property
    def best_human_identifier(self) -> Optional[Identifier]:
        """Return the preferred identifier for the document, providing that it is considered human readable."""
        preferred_identifier = self.identifiers.preferred()
        if preferred_identifier and preferred_identifier.schema.human_readable:
            return preferred_identifier
        return None

    @property
    def public_uri(self) -> str:
        """
        :return: The absolute, public URI at which a copy of this document can be found
        """
        return f"https://caselaw.nationalarchives.gov.uk/{self.slug}"

    @cached_property
    def slug(self) -> str:
        """
        :return: The best public-facing URL for the judgment, which is the slug
        of the most-preferred identifier, which should either be an NCN or fclid.

        :raises RuntimeError: No preferred identifier exists for this document.
        """
        preferred_identifier = self.identifiers.preferred()
        if preferred_identifier:
            return preferred_identifier.url_slug
        msg = f"No preferred identifier exists for {self.uri}"
        raise RuntimeError(msg)

    @cached_property
    def is_published(self) -> bool:
        """Is this document marked as published in MarkLogic?"""
        return self.api_client.get_published(self.uri)

    @cached_property
    def is_held(self) -> bool:
        """Is the MarkLogic `editor-hold` property the string `"true"`?"""
        return self.api_client.get_property(self.uri, "editor-hold") == "true"

    @cached_property
    def is_locked(self) -> bool:
        """Is this document checked out by an editor (i.e. a checkout message exists)?"""
        return self.checkout_message is not None

    @cached_property
    def checkout_message(self) -> Optional[str]:
        """The checkout status message from MarkLogic, or `None` if not checked out."""
        return self.api_client.get_judgment_checkout_status_message(self.uri)

    @cached_property
    def source_name(self) -> str:
        """The MarkLogic `source-name` property."""
        return self.api_client.get_property(self.uri, "source-name")

    @cached_property
    def source_email(self) -> str:
        """The MarkLogic `source-email` property."""
        return self.api_client.get_property(self.uri, "source-email")

    @cached_property
    def consignment_reference(self) -> str:
        """The MarkLogic `transfer-consignment-reference` property."""
        return self.api_client.get_property(self.uri, "transfer-consignment-reference")

    @property
    def docx_url(self) -> str:
        """This generates a signed link to the unpublished S3 bucket and should not be used in public contexts."""
        return generate_docx_url(self.uri)

    @property
    def pdf_url(self) -> str:
        """This generates a signed link to the unpublished S3 bucket and should not be used in public contexts."""
        return generate_pdf_url(self.uri)

    @cached_property
    def assigned_to(self) -> str:
        """The MarkLogic `assigned-to` property (the editor this document is assigned to)."""
        return self.api_client.get_property(self.uri, "assigned-to")

    @cached_property
    def versions(self) -> list[VersionsDict]:
        """Version metadata for this document, decoded from MarkLogic's multipart response."""
        versions_response = self.api_client.list_judgment_versions(self.uri)

        try:
            decoded_versions = decoder.MultipartDecoder.from_response(versions_response)
            return render_versions(decoded_versions.parts)
        except AttributeError:
            # NOTE(review): presumably the decoder raises AttributeError when the
            # response has no multipart payload (no versions) -- confirm.
            return []

    @cached_property
    def versions_as_documents(self) -> list["Document"]:
        """
        Returns a list of `Document` subclasses corresponding to the versions of the document. The first entry is:
        * the most recent
        * the highest numbered

        Note that this is only valid on the managed document -- a `DLS-DOCUMENTVERSION` error will occur if the document
        this is called on is itself a version.

        :raises NotSupportedOnVersion: This document is itself a version.
        """
        if self.is_version:
            raise NotSupportedOnVersion(
                f"Cannot get versions of a version for {self.uri}",
            )
        docs = []
        for version in self.versions:
            doc_uri = DocumentURIString(version["uri"])
            docs.append(self.api_client.get_document_by_uri(doc_uri))
        return docs

    @cached_property
    def version_number(self) -> int:
        """
        Note that the highest number is the most recent version.
        Raises an exception if it is not a version (e.g. /2022/eat/1 is not a version)

        :raises OnlySupportedOnVersion: This document is not a version.
        """
        version = extract_version(self.uri)
        if version == 0:
            raise OnlySupportedOnVersion(
                f"Version number requested for {self.uri} which is not a version",
            )
        return version

    @cached_property
    def is_version(self) -> bool:
        "Is this document a potentially historic version of a document, or is it the main document itself?"
        # extract_version returns 0 for non-version URIs.
        return extract_version(self.uri) != 0

    @cached_property
    def is_failure(self) -> bool:
        """
        Is this document in a 'failure' state from which no recovery is possible? This is considered to be the case if:

        - The document entirely failed to parse

        :return: `True` if this document is in a 'failure' state, otherwise `False`
        """
        return self.body.failed_to_parse

    @cached_property
    def is_parked(self) -> bool:
        """Is this document at a temporary "parked" URI? True if "parked" appears anywhere in the URI."""
        return "parked" in self.uri

    @cached_property
    def has_name(self) -> bool:
        """Does the document body have a non-empty name?"""
        return bool(self.body.name)

    @cached_property
    def has_valid_court(self) -> bool:
        """Does the document's court/jurisdiction identifier resolve to a known court?"""
        try:
            return bool(
                courts.get_by_code(self.body.court_and_jurisdiction_identifier_string),
            )
        except CourtNotFoundException:
            return False

    @cached_property
    def is_publishable(self) -> bool:
        # If there are any validation failures, there will be no messages in the list.
        # An empty list (which is falsy) therefore means the judgment can be published safely.
        return not self.validation_failure_messages

    @cached_property
    def first_published_datetime(self) -> Optional[datetime.datetime]:
        """
        Return the database value for the date and time this document was first published.

        :return: The datetime value in the database for "first published".
        """
        return self.api_client.get_datetime_property(self.uri, "first_published_datetime")

    @cached_property
    def first_published_datetime_display(self) -> Optional[datetime.datetime]:
        """
        Return the display value for the date and time this document was first published.

        A value of 1970-01-01 00:00 indicates that the document has been published previously, but the exact date and time is unknown. In this case, return `None`. This can be used alongside `has_ever_been_published` to indicate an "unknown" state.

        :return: The datetime value to be displayed to end users for "first published".
        """

        # The Unix epoch is used as a sentinel for "published, date unknown".
        if self.first_published_datetime == datetime.datetime(1970, 1, 1, 0, 0, tzinfo=datetime.timezone.utc):
            return None

        return self.first_published_datetime

    @cached_property
    def has_ever_been_published(self) -> bool:
        """
        Do we consider this document to have ever been published?

        This is `True` if either the document is currently published, or if `first_published_datetime` has any value (including the sentinel value).

        :return: A boolean indicating if the document has ever been published.
        """
        return self.is_published or self.first_published_datetime is not None

    @cached_property
    def validation_failure_messages(self) -> list[str]:
        """Human-readable messages for every failed check in `attributes_to_validate`, sorted alphabetically."""
        exception_list = []
        for function_name, pass_value, message in self.attributes_to_validate:
            if getattr(self, function_name) != pass_value:
                exception_list.append(message.format(document_noun=self.document_noun))
        return sorted(exception_list)

    @cached_property
    def annotation(self) -> str:
        """The raw annotation string for this (version of the) document from MarkLogic."""
        return self.api_client.get_version_annotation(self.uri)

    @cached_property
    def structured_annotation(self) -> AnnotationDataDict:
        """The annotation parsed as JSON and validated against `AnnotationDataDict`."""
        annotation_data_dict_loader = TypeAdapter(AnnotationDataDict)

        return annotation_data_dict_loader.validate_json(self.annotation)

    @cached_property
    def has_unique_content_hash(self) -> bool:
        """Check if the content hash of this document is unique compared to all other documents in MarkLogic."""
        return self.api_client.has_unique_content_hash(self.uri)

    @cached_property
    def has_only_clean_assets(self) -> bool:
        """False if any non-tar.gz assets associated with this document have not been cleaned."""
        # The early return deliberately short-circuits the real check below
        # until the asset cleaning pipeline is enabled.
        return True  # TODO: Remove this once we have enabled the asset cleaning pipeline.
        return are_unpublished_assets_clean(self.uri)

    @cached_property
    def version_created_datetime(self) -> datetime.datetime:
        """The datetime at which this version was created, from MarkLogic."""
        return self.api_client.get_version_created_datetime(self.uri)

    @property
    def status(self) -> str:
        """Editorial status, derived in priority order: published > held > assigned (in progress) > new."""
        if self.is_published:
            return DOCUMENT_STATUS_PUBLISHED

        if self.is_held:
            return DOCUMENT_STATUS_HOLD

        if self.assigned_to:
            return DOCUMENT_STATUS_IN_PROGRESS

        return DOCUMENT_STATUS_NEW

    def force_enrich(self) -> None:
        """
        Request enrichment of the document, but do no checks

        :raises CannotEnrichUnenrichableDocument: The document fails the `can_enrich` check.
        """
        # The timestamp is set before the capability check, so it effectively
        # records "last attempted enrichment" (cf. the same pattern in reparse()).
        now = datetime.datetime.now(datetime.timezone.utc)
        self.api_client.set_property(
            self.uri,
            "last_sent_to_enrichment",
            now.isoformat(),
        )

        if not self.can_enrich:
            msg = f"{self.uri} cannot be enriched"
            raise CannotEnrichUnenrichableDocument(msg)

        announce_document_event(
            uri=self.uri,
            status="enrich",
            enrich=True,
        )

    def enrich(self, even_if_recent: bool = False, accept_failures: bool = False) -> bool:
        """
        Request enrichment of a document, if it's sensible to do so.

        :param even_if_recent: Request enrichment even if the document was enriched within `MINIMUM_ENRICHMENT_TIME`.
        :param accept_failures: If `True`, swallow `CannotEnrichUnenrichableDocument` and return `False` instead.

        :return: `True` if enrichment was requested, `False` otherwise.
        """
        if not (even_if_recent) and self.enriched_recently:
            print("Enrichment not requested as document was enriched recently")
            return False

        print("Enrichment requested")

        try:
            self.force_enrich()
        except CannotEnrichUnenrichableDocument as e:
            if not accept_failures:
                raise e
            return False

        return True

    @cached_property
    def enriched_recently(self) -> bool:
        """
        Has this document been enriched recently?
        """

        last_enrichment = self.body.enrichment_datetime
        if not last_enrichment:
            return False

        now = datetime.datetime.now(tz=datetime.timezone.utc)

        return now - last_enrichment < MINIMUM_ENRICHMENT_TIME

    @cached_property
    def validates_against_schema(self) -> bool:
        """
        Does the document validate against the most recent schema?
        """
        return self.api_client.validate_document(self.uri)

    def assign_fclid_if_missing(self) -> Optional[FindCaseLawIdentifier]:
        """If the document does not have an FCLID already, mint a new one and save it."""
        if len(self.identifiers.of_type(FindCaseLawIdentifier)) == 0:
            document_fclid = FindCaseLawIdentifierSchema.mint(self.api_client)
            self.identifiers.add(document_fclid)
            self.save_identifiers()
            return document_fclid

        return None

    def publish(self) -> None:
        """
        Assuming that a document passes pre-publish checks, perform all necessary operations to put it into a published state.

        :raises CannotPublishUnpublishableDocument: This document has not passed the checks in `is_publishable`, and as
            such cannot be published.
        """
        if not self.is_publishable:
            raise CannotPublishUnpublishableDocument

        ## Make sure the document has an FCLID
        self.assign_fclid_if_missing()

        ## Copy the document assets into the appropriate place in S3
        publish_documents(self.uri)

        ## Set the fact the document is published
        self.api_client.set_published(self.uri, True)

        ## If necessary, set the first published date
        if not self.first_published_datetime:
            self.api_client.set_datetime_property(
                self.uri, "first_published_datetime", datetime.datetime.now(datetime.timezone.utc)
            )

        ## Announce the publication on the event bus
        announce_document_event(
            uri=self.uri,
            status="publish",
        )

        ## Send the document off for enrichment, but accept if we can't for any reason
        self.enrich(accept_failures=True)

    def unpublish(self) -> None:
        """Take this document out of the published state: break any editor checkout, remove public assets, unset the published flag and announce the event."""
        self.api_client.break_checkout(self.uri)
        unpublish_documents(self.uri)
        self.api_client.set_published(self.uri, False)
        announce_document_event(
            uri=self.uri,
            status="unpublish",
        )

    def hold(self) -> None:
        """Place an editor hold on this document (sets the `editor-hold` property to `"true"`)."""
        self.api_client.set_property(self.uri, "editor-hold", "true")

def unhold(self) -> None: 522 self.api_client.set_property(self.uri, "editor-hold", "false") 523 524 @cached_property 525 def safe_to_delete(self) -> bool: 526 """ 527 Determines if a document is in a state where it's safe to be deleted, eg not currently publicly available. 528 529 :return: If the document is safe to be deleted 530 """ 531 532 return not self.is_published 533 534 def delete(self) -> None: 535 """ 536 Deletes this document from MarkLogic and any resources from AWS. 537 """ 538 539 if self.safe_to_delete: 540 self.api_client.delete_judgment(self.uri) 541 delete_documents_from_private_bucket(self.uri) 542 else: 543 raise DocumentNotSafeForDeletion 544 545 def move(self, new_citation: NeutralCitationString) -> None: 546 self.api_client.update_document_uri(self.uri, new_citation) 547 548 def force_reparse(self) -> None: 549 "Send an SNS notification that triggers reparsing, also sending all editor-modifiable metadata and URI" 550 551 now = datetime.datetime.now(datetime.timezone.utc) 552 self.api_client.set_property(self.uri, "last_sent_to_parser", now.isoformat()) 553 554 checked_date: Optional[str] = ( 555 self.body.document_date_as_date.isoformat() 556 if self.body.document_date_as_date and self.body.document_date_as_date > datetime.date(1001, 1, 1) 557 else None 558 ) 559 560 # the keys of parser_instructions should exactly match the parser output 561 # in the *-metadata.json files by the parser. Whilst typically empty 562 # values are "" from the API, we should pass None instead in this case. 
563 564 parser_instructions: ParserInstructionsDict = { 565 "metadata": { 566 "name": self.body.name or None, 567 "cite": None, 568 "court": self.body.court or None, 569 "date": checked_date, 570 "uri": self.uri, 571 } 572 } 573 574 if self._default_reparse_document_type: 575 parser_instructions["documentType"] = self._default_reparse_document_type 576 577 ## TODO: Remove this hack around the fact that NCNs are assumed to be present for all documents' metadata, but actually different document classes may have different metadata 578 if hasattr(self, "neutral_citation"): 579 parser_instructions["metadata"]["cite"] = self.neutral_citation 580 581 request_parse( 582 uri=self.uri, 583 reference=self.consignment_reference, 584 parser_instructions=parser_instructions, 585 ) 586 587 def reparse(self) -> bool: 588 # note that we set 'last_sent_to_parser' even if we can't send it to the parser 589 # it means 'last tried to reparse' much more consistently. 590 now = datetime.datetime.now(datetime.timezone.utc) 591 self.api_client.set_property(self.uri, "last_sent_to_parser", now.isoformat()) 592 if self.can_reparse: 593 self.force_reparse() 594 return True 595 return False 596 597 @cached_property 598 def can_reparse(self) -> bool: 599 """ 600 Is it sensible to reparse this document? 601 """ 602 return self.docx_exists() and not self.body.has_external_data 603 604 @cached_property 605 def can_enrich(self) -> bool: 606 """ 607 Is it possible to enrich this document? 
608 """ 609 return self.body.has_content and not self.body.has_external_data 610 611 def validate_identifiers(self) -> SuccessFailureMessageTuple: 612 return self.identifiers.perform_all_validations(document_type=type(self), api_client=self.api_client) 613 614 def save_identifiers(self) -> None: 615 """Validate the identifiers, and if the validation passes save them to MarkLogic""" 616 validations = self.validate_identifiers() 617 if validations.success is True: 618 self.api_client.set_property_as_node(self.uri, "identifiers", self.identifiers.as_etree) 619 else: 620 raise IdentifierValidationException( 621 "Unable to save identifiers; validation constraints not met: " + ", ".join(validations.messages) 622 ) 623 624 def __getattr__(self, name: str) -> Any: 625 warnings.warn(f"{name} no longer exists on Document, using Document.body instead", DeprecationWarning) 626 try: 627 return getattr(self.body, name) 628 except Exception: 629 raise AttributeError(f"Neither 'Document' nor 'DocumentBody' objects have an attribute '{name}'") 630 631 def linked_document_resolutions(self, namespaces: list[str], only_published: bool = True) -> IdentifierResolutions: 632 """Get document resolutions which share the same neutral citation as this document.""" 633 if not hasattr(self, "neutral_citation") or not self.neutral_citation: 634 return IdentifierResolutions([]) 635 636 resolutions = self.api_client.resolve_from_identifier_value(self.neutral_citation) 637 if only_published: 638 resolutions = resolutions.published() 639 640 # only documents which aren't this one and have a right namespace 641 return IdentifierResolutions( 642 [ 643 resolution 644 for resolution in resolutions 645 if resolution.document_uri != self.uri.as_marklogic() and resolution.identifier_namespace in namespaces 646 ] 647 ) 648 649 def linked_documents(self, namespaces: list[str], only_published: bool = True) -> list["Document"]: 650 resolutions = self.linked_document_resolutions(namespaces=namespaces, 
only_published=only_published) 651 return [ 652 Document(resolution.document_uri.as_document_uri(), api_client=self.api_client) 653 for resolution in resolutions 654 ] 655 656 def content_as_html(self) -> str | None: 657 xlst_image_location = os.getenv("XSLT_IMAGE_LOCATION", "") 658 return self.body.content_html(f"{xlst_image_location}/{self.uri}") 659 660 def xml_with_correct_frbr(self) -> bytes: 661 """Dynamically modify FRBR uris to reflect current storage location and FCL id""" 662 fcl_identifiers = self.identifiers.of_type(FindCaseLawIdentifier) 663 work_uri = f"https://caselaw.nationalarchives.gov.uk/id/doc/{fcl_identifiers[0].value}" 664 expression_uri = f"https://caselaw.nationalarchives.gov.uk/{self.uri.lstrip('/')}" 665 manifestation_uri = f"https://caselaw.nationalarchives.gov.uk/{self.uri.lstrip('/')}/data.xml" 666 return self.body.apply_xslt( 667 "modify_xml_live.xsl", work_uri=work_uri, expression_uri=expression_uri, manifestation_uri=manifestation_uri 668 ) 669 670 def compare_to(self, that_doc: "Document") -> comparison.Comparison: 671 return comparison.Comparison(self, that_doc)
Base class for warnings about dubious runtime behavior.
59class Document: 60 """ 61 A base class from which all other document types are extensions. This class includes the essential methods for 62 retrieving and manipulating a document within MarkLogic. 63 """ 64 65 document_noun = "document" 66 """ The noun for a single instance of this document type. """ 67 68 document_noun_plural = "documents" 69 """ The noun for a plural of this document type. """ 70 71 _default_reparse_document_type: Optional[str] = None 72 """ The default noun to pass to the parser when reparsing given the document type if known. This is used to determine how the document should be parsed and processed.""" 73 74 type_collection_name: str 75 76 attributes_to_validate: list[tuple[str, bool, str]] = [ 77 ( 78 "is_failure", 79 False, 80 "This document failed to parse", 81 ), 82 ( 83 "is_parked", 84 False, 85 "This {document_noun} is currently parked at a temporary URI", 86 ), 87 ( 88 "is_held", 89 False, 90 "This {document_noun} is currently on hold", 91 ), 92 ( 93 "has_name", 94 True, 95 "This {document_noun} has no name", 96 ), 97 ( 98 "has_valid_court", 99 True, 100 "The court for this {document_noun} is not valid", 101 ), 102 ( 103 "has_unique_content_hash", 104 True, 105 "There is another document with identical content", 106 ), 107 ( 108 "has_only_clean_assets", 109 True, 110 "An uncleaned asset exists for this document", 111 ), 112 ] 113 """ 114 A list of tuples in the form: 115 116 ``` python 117 ( 118 attribute_name, 119 passing_condition, 120 error_message, 121 ) 122 ``` 123 124 describing attributes which should be checked in order for a document to be considered valid. 125 126 Individual document classes should extend this list where necessary to validate document type-specific attributes. 127 """ 128 129 def __init__(self, uri: DocumentURIString, api_client: "MarklogicApiClient", search_query: Optional[str] = None): 130 """ 131 :param uri: The URI of the document to retrieve from MarkLogic. 
132 :param api_client: An instance of the API client object to handle communication with the MarkLogic server. 133 :param search_query: Optionally, a search string which should be highlighted if it appears in the document body. 134 135 :raises DocumentNotFoundError: The document does not exist within MarkLogic 136 """ 137 self.uri: DocumentURIString = uri 138 self.api_client: MarklogicApiClient = api_client 139 if not self.document_exists(): 140 raise DocumentNotFoundError(f"Document {self.uri} does not exist") 141 142 self.body: DocumentBody = DocumentBody( 143 xml_bytestring=self.api_client.get_judgment_xml_bytestring( 144 self.uri, 145 show_unpublished=True, 146 search_query=search_query, 147 ), 148 ) 149 """ `Document.body` represents the body of the document itself, without any information such as version tracking or properties. """ 150 151 self._initialise_identifiers() 152 153 def __repr__(self) -> str: 154 name = self.body.name or "un-named" 155 return f"<{self.document_noun} {self.uri}: {name}>" 156 157 def document_exists(self) -> bool: 158 """Helper method to verify the existence of a document within MarkLogic. 
159 160 :return: `True` if the document exists, `False` otherwise.""" 161 return self.api_client.document_exists(self.uri) 162 163 def docx_exists(self) -> bool: 164 """There is a docx in S3 private bucket for this Document""" 165 return check_docx_exists(self.uri) 166 167 def _initialise_identifiers(self) -> None: 168 """Load this document's identifiers from MarkLogic.""" 169 170 identifiers_element_as_etree = self.api_client.get_property_as_node(self.uri, "identifiers") 171 self.identifiers = unpack_all_identifiers_from_etree(identifiers_element_as_etree) 172 173 @property 174 def best_human_identifier(self) -> Optional[Identifier]: 175 """Return the preferred identifier for the document, providing that it is considered human readable.""" 176 preferred_identifier = self.identifiers.preferred() 177 if preferred_identifier and preferred_identifier.schema.human_readable: 178 return preferred_identifier 179 return None 180 181 @property 182 def public_uri(self) -> str: 183 """ 184 :return: The absolute, public URI at which a copy of this document can be found 185 """ 186 return f"https://caselaw.nationalarchives.gov.uk/{self.slug}" 187 188 @cached_property 189 def slug(self) -> str: 190 """ 191 :return: The best public-facing URL for the judgment, which is the slug 192 of the most-preferred identifier, which should either be an NCN or fclid. 
193 """ 194 preferred_identifier = self.identifiers.preferred() 195 if preferred_identifier: 196 return preferred_identifier.url_slug 197 msg = f"No preferred identifier exists for {self.uri}" 198 raise RuntimeError(msg) 199 200 @cached_property 201 def is_published(self) -> bool: 202 return self.api_client.get_published(self.uri) 203 204 @cached_property 205 def is_held(self) -> bool: 206 return self.api_client.get_property(self.uri, "editor-hold") == "true" 207 208 @cached_property 209 def is_locked(self) -> bool: 210 return self.checkout_message is not None 211 212 @cached_property 213 def checkout_message(self) -> Optional[str]: 214 return self.api_client.get_judgment_checkout_status_message(self.uri) 215 216 @cached_property 217 def source_name(self) -> str: 218 return self.api_client.get_property(self.uri, "source-name") 219 220 @cached_property 221 def source_email(self) -> str: 222 return self.api_client.get_property(self.uri, "source-email") 223 224 @cached_property 225 def consignment_reference(self) -> str: 226 return self.api_client.get_property(self.uri, "transfer-consignment-reference") 227 228 @property 229 def docx_url(self) -> str: 230 """This generates a signed link to the unpublished S3 bucket and should not be used in public contexts.""" 231 return generate_docx_url(self.uri) 232 233 @property 234 def pdf_url(self) -> str: 235 """This generates a signed link to the unpublished S3 bucket and should not be used in public contexts.""" 236 return generate_pdf_url(self.uri) 237 238 @cached_property 239 def assigned_to(self) -> str: 240 return self.api_client.get_property(self.uri, "assigned-to") 241 242 @cached_property 243 def versions(self) -> list[VersionsDict]: 244 versions_response = self.api_client.list_judgment_versions(self.uri) 245 246 try: 247 decoded_versions = decoder.MultipartDecoder.from_response(versions_response) 248 return render_versions(decoded_versions.parts) 249 except AttributeError: 250 return [] 251 252 @cached_property 253 
def versions_as_documents(self) -> list["Document"]: 254 """ 255 Returns a list of `Document` subclasses corresponding to the versions of the document. The first entry is: 256 * the most recent 257 * the highest numbered 258 259 Note that this is only valid on the managed document -- a `DLS-DOCUMENTVERSION` error will occur if the document 260 this is called on is itself a version. 261 """ 262 if self.is_version: 263 raise NotSupportedOnVersion( 264 f"Cannot get versions of a version for {self.uri}", 265 ) 266 docs = [] 267 for version in self.versions: 268 doc_uri = DocumentURIString(version["uri"]) 269 docs.append(self.api_client.get_document_by_uri(doc_uri)) 270 return docs 271 272 @cached_property 273 def version_number(self) -> int: 274 """ 275 Note that the highest number is the most recent version. 276 Raises an exception if it is not a version (e.g. /2022/eat/1 is not a version) 277 """ 278 version = extract_version(self.uri) 279 if version == 0: 280 raise OnlySupportedOnVersion( 281 f"Version number requested for {self.uri} which is not a version", 282 ) 283 return version 284 285 @cached_property 286 def is_version(self) -> bool: 287 "Is this document a potentially historic version of a document, or is it the main document itself?" 288 return extract_version(self.uri) != 0 289 290 @cached_property 291 def is_failure(self) -> bool: 292 """ 293 Is this document in a 'failure' state from which no recovery is possible? 
This is considered to be the case if: 294 295 - The document entirely failed to parse 296 297 :return: `True` if this document is in a 'failure' state, otherwise `False` 298 """ 299 return self.body.failed_to_parse 300 301 @cached_property 302 def is_parked(self) -> bool: 303 return "parked" in self.uri 304 305 @cached_property 306 def has_name(self) -> bool: 307 return bool(self.body.name) 308 309 @cached_property 310 def has_valid_court(self) -> bool: 311 try: 312 return bool( 313 courts.get_by_code(self.body.court_and_jurisdiction_identifier_string), 314 ) 315 except CourtNotFoundException: 316 return False 317 318 @cached_property 319 def is_publishable(self) -> bool: 320 # If there are any validation failures, there will be no messages in the list. 321 # An empty list (which is falsy) therefore means the judgment can be published safely. 322 return not self.validation_failure_messages 323 324 @cached_property 325 def first_published_datetime(self) -> Optional[datetime.datetime]: 326 """ 327 Return the database value for the date and time this document was first published. 328 329 :return: The datetime value in the database for "first published". 330 """ 331 return self.api_client.get_datetime_property(self.uri, "first_published_datetime") 332 333 @cached_property 334 def first_published_datetime_display(self) -> Optional[datetime.datetime]: 335 """ 336 Return the display value for the date and time this document was first published. 337 338 A value of 1970-01-01 00:00 indicates that the document has been published previously, but the exact date and time is unknown. In this case, return `None`. This can be used alongside `has_ever_been_published` to indicate an "unknown" state. 339 340 :return: The datetime value to be displayed to end users for "first published". 
341 """ 342 343 if self.first_published_datetime == datetime.datetime(1970, 1, 1, 0, 0, tzinfo=datetime.timezone.utc): 344 return None 345 346 return self.first_published_datetime 347 348 @cached_property 349 def has_ever_been_published(self) -> bool: 350 """ 351 Do we consider this document to have ever been published? 352 353 This is `True` if either the document is currently published, or if `first_published_datetime` has any value (including the sentinel value). 354 355 :return: A boolean indicating if the document has ever been published. 356 """ 357 return self.is_published or self.first_published_datetime is not None 358 359 @cached_property 360 def validation_failure_messages(self) -> list[str]: 361 exception_list = [] 362 for function_name, pass_value, message in self.attributes_to_validate: 363 if getattr(self, function_name) != pass_value: 364 exception_list.append(message.format(document_noun=self.document_noun)) 365 return sorted(exception_list) 366 367 @cached_property 368 def annotation(self) -> str: 369 return self.api_client.get_version_annotation(self.uri) 370 371 @cached_property 372 def structured_annotation(self) -> AnnotationDataDict: 373 annotation_data_dict_loader = TypeAdapter(AnnotationDataDict) 374 375 return annotation_data_dict_loader.validate_json(self.annotation) 376 377 @cached_property 378 def has_unique_content_hash(self) -> bool: 379 """Check if the content hash of this document is unique compared to all other documents in MarkLogic.""" 380 return self.api_client.has_unique_content_hash(self.uri) 381 382 @cached_property 383 def has_only_clean_assets(self) -> bool: 384 """False if any non-tar.gz assets associated with this document have not been cleaned.""" 385 return True # TODO: Remove this once we have enabled the asset cleaning pipeline. 
386 return are_unpublished_assets_clean(self.uri) 387 388 @cached_property 389 def version_created_datetime(self) -> datetime.datetime: 390 return self.api_client.get_version_created_datetime(self.uri) 391 392 @property 393 def status(self) -> str: 394 if self.is_published: 395 return DOCUMENT_STATUS_PUBLISHED 396 397 if self.is_held: 398 return DOCUMENT_STATUS_HOLD 399 400 if self.assigned_to: 401 return DOCUMENT_STATUS_IN_PROGRESS 402 403 return DOCUMENT_STATUS_NEW 404 405 def force_enrich(self) -> None: 406 """ 407 Request enrichment of the document, but do no checks 408 """ 409 now = datetime.datetime.now(datetime.timezone.utc) 410 self.api_client.set_property( 411 self.uri, 412 "last_sent_to_enrichment", 413 now.isoformat(), 414 ) 415 416 if not self.can_enrich: 417 msg = f"{self.uri} cannot be enriched" 418 raise CannotEnrichUnenrichableDocument(msg) 419 420 announce_document_event( 421 uri=self.uri, 422 status="enrich", 423 enrich=True, 424 ) 425 426 def enrich(self, even_if_recent: bool = False, accept_failures: bool = False) -> bool: 427 """ 428 Request enrichment of a document, if it's sensible to do so. 429 """ 430 if not (even_if_recent) and self.enriched_recently: 431 print("Enrichment not requested as document was enriched recently") 432 return False 433 434 print("Enrichment requested") 435 436 try: 437 self.force_enrich() 438 except CannotEnrichUnenrichableDocument as e: 439 if not accept_failures: 440 raise e 441 return False 442 443 return True 444 445 @cached_property 446 def enriched_recently(self) -> bool: 447 """ 448 Has this document been enriched recently? 449 """ 450 451 last_enrichment = self.body.enrichment_datetime 452 if not last_enrichment: 453 return False 454 455 now = datetime.datetime.now(tz=datetime.timezone.utc) 456 457 return now - last_enrichment < MINIMUM_ENRICHMENT_TIME 458 459 @cached_property 460 def validates_against_schema(self) -> bool: 461 """ 462 Does the document validate against the most recent schema? 
463 """ 464 return self.api_client.validate_document(self.uri) 465 466 def assign_fclid_if_missing(self) -> Optional[FindCaseLawIdentifier]: 467 """If the document does not have an FCLID already, mint a new one and save it.""" 468 if len(self.identifiers.of_type(FindCaseLawIdentifier)) == 0: 469 document_fclid = FindCaseLawIdentifierSchema.mint(self.api_client) 470 self.identifiers.add(document_fclid) 471 self.save_identifiers() 472 return document_fclid 473 474 return None 475 476 def publish(self) -> None: 477 """ 478 Assuming that a document passes pre-publish checks, perform all necessary operations to put it into a published state. 479 480 :raises CannotPublishUnpublishableDocument: This document has not passed the checks in `is_publishable`, and as 481 such cannot be published. 482 """ 483 if not self.is_publishable: 484 raise CannotPublishUnpublishableDocument 485 486 ## Make sure the document has an FCLID 487 self.assign_fclid_if_missing() 488 489 ## Copy the document assets into the appropriate place in S3 490 publish_documents(self.uri) 491 492 ## Set the fact the document is published 493 self.api_client.set_published(self.uri, True) 494 495 ## If necessary, set the first published date 496 if not self.first_published_datetime: 497 self.api_client.set_datetime_property( 498 self.uri, "first_published_datetime", datetime.datetime.now(datetime.timezone.utc) 499 ) 500 501 ## Announce the publication on the event bus 502 announce_document_event( 503 uri=self.uri, 504 status="publish", 505 ) 506 507 ## Send the document off for enrichment, but accept if we can't for any reason 508 self.enrich(accept_failures=True) 509 510 def unpublish(self) -> None: 511 self.api_client.break_checkout(self.uri) 512 unpublish_documents(self.uri) 513 self.api_client.set_published(self.uri, False) 514 announce_document_event( 515 uri=self.uri, 516 status="unpublish", 517 ) 518 519 def hold(self) -> None: 520 self.api_client.set_property(self.uri, "editor-hold", "true") 521 522 
def unhold(self) -> None: 523 self.api_client.set_property(self.uri, "editor-hold", "false") 524 525 @cached_property 526 def safe_to_delete(self) -> bool: 527 """ 528 Determines if a document is in a state where it's safe to be deleted, eg not currently publicly available. 529 530 :return: If the document is safe to be deleted 531 """ 532 533 return not self.is_published 534 535 def delete(self) -> None: 536 """ 537 Deletes this document from MarkLogic and any resources from AWS. 538 """ 539 540 if self.safe_to_delete: 541 self.api_client.delete_judgment(self.uri) 542 delete_documents_from_private_bucket(self.uri) 543 else: 544 raise DocumentNotSafeForDeletion 545 546 def move(self, new_citation: NeutralCitationString) -> None: 547 self.api_client.update_document_uri(self.uri, new_citation) 548 549 def force_reparse(self) -> None: 550 "Send an SNS notification that triggers reparsing, also sending all editor-modifiable metadata and URI" 551 552 now = datetime.datetime.now(datetime.timezone.utc) 553 self.api_client.set_property(self.uri, "last_sent_to_parser", now.isoformat()) 554 555 checked_date: Optional[str] = ( 556 self.body.document_date_as_date.isoformat() 557 if self.body.document_date_as_date and self.body.document_date_as_date > datetime.date(1001, 1, 1) 558 else None 559 ) 560 561 # the keys of parser_instructions should exactly match the parser output 562 # in the *-metadata.json files by the parser. Whilst typically empty 563 # values are "" from the API, we should pass None instead in this case. 
564 565 parser_instructions: ParserInstructionsDict = { 566 "metadata": { 567 "name": self.body.name or None, 568 "cite": None, 569 "court": self.body.court or None, 570 "date": checked_date, 571 "uri": self.uri, 572 } 573 } 574 575 if self._default_reparse_document_type: 576 parser_instructions["documentType"] = self._default_reparse_document_type 577 578 ## TODO: Remove this hack around the fact that NCNs are assumed to be present for all documents' metadata, but actually different document classes may have different metadata 579 if hasattr(self, "neutral_citation"): 580 parser_instructions["metadata"]["cite"] = self.neutral_citation 581 582 request_parse( 583 uri=self.uri, 584 reference=self.consignment_reference, 585 parser_instructions=parser_instructions, 586 ) 587 588 def reparse(self) -> bool: 589 # note that we set 'last_sent_to_parser' even if we can't send it to the parser 590 # it means 'last tried to reparse' much more consistently. 591 now = datetime.datetime.now(datetime.timezone.utc) 592 self.api_client.set_property(self.uri, "last_sent_to_parser", now.isoformat()) 593 if self.can_reparse: 594 self.force_reparse() 595 return True 596 return False 597 598 @cached_property 599 def can_reparse(self) -> bool: 600 """ 601 Is it sensible to reparse this document? 602 """ 603 return self.docx_exists() and not self.body.has_external_data 604 605 @cached_property 606 def can_enrich(self) -> bool: 607 """ 608 Is it possible to enrich this document? 
609 """ 610 return self.body.has_content and not self.body.has_external_data 611 612 def validate_identifiers(self) -> SuccessFailureMessageTuple: 613 return self.identifiers.perform_all_validations(document_type=type(self), api_client=self.api_client) 614 615 def save_identifiers(self) -> None: 616 """Validate the identifiers, and if the validation passes save them to MarkLogic""" 617 validations = self.validate_identifiers() 618 if validations.success is True: 619 self.api_client.set_property_as_node(self.uri, "identifiers", self.identifiers.as_etree) 620 else: 621 raise IdentifierValidationException( 622 "Unable to save identifiers; validation constraints not met: " + ", ".join(validations.messages) 623 ) 624 625 def __getattr__(self, name: str) -> Any: 626 warnings.warn(f"{name} no longer exists on Document, using Document.body instead", DeprecationWarning) 627 try: 628 return getattr(self.body, name) 629 except Exception: 630 raise AttributeError(f"Neither 'Document' nor 'DocumentBody' objects have an attribute '{name}'") 631 632 def linked_document_resolutions(self, namespaces: list[str], only_published: bool = True) -> IdentifierResolutions: 633 """Get document resolutions which share the same neutral citation as this document.""" 634 if not hasattr(self, "neutral_citation") or not self.neutral_citation: 635 return IdentifierResolutions([]) 636 637 resolutions = self.api_client.resolve_from_identifier_value(self.neutral_citation) 638 if only_published: 639 resolutions = resolutions.published() 640 641 # only documents which aren't this one and have a right namespace 642 return IdentifierResolutions( 643 [ 644 resolution 645 for resolution in resolutions 646 if resolution.document_uri != self.uri.as_marklogic() and resolution.identifier_namespace in namespaces 647 ] 648 ) 649 650 def linked_documents(self, namespaces: list[str], only_published: bool = True) -> list["Document"]: 651 resolutions = self.linked_document_resolutions(namespaces=namespaces, 
only_published=only_published) 652 return [ 653 Document(resolution.document_uri.as_document_uri(), api_client=self.api_client) 654 for resolution in resolutions 655 ] 656 657 def content_as_html(self) -> str | None: 658 xlst_image_location = os.getenv("XSLT_IMAGE_LOCATION", "") 659 return self.body.content_html(f"{xlst_image_location}/{self.uri}") 660 661 def xml_with_correct_frbr(self) -> bytes: 662 """Dynamically modify FRBR uris to reflect current storage location and FCL id""" 663 fcl_identifiers = self.identifiers.of_type(FindCaseLawIdentifier) 664 work_uri = f"https://caselaw.nationalarchives.gov.uk/id/doc/{fcl_identifiers[0].value}" 665 expression_uri = f"https://caselaw.nationalarchives.gov.uk/{self.uri.lstrip('/')}" 666 manifestation_uri = f"https://caselaw.nationalarchives.gov.uk/{self.uri.lstrip('/')}/data.xml" 667 return self.body.apply_xslt( 668 "modify_xml_live.xsl", work_uri=work_uri, expression_uri=expression_uri, manifestation_uri=manifestation_uri 669 ) 670 671 def compare_to(self, that_doc: "Document") -> comparison.Comparison: 672 return comparison.Comparison(self, that_doc)
A base class from which all other document types are extensions. This class includes the essential methods for retrieving and manipulating a document within MarkLogic.
129 def __init__(self, uri: DocumentURIString, api_client: "MarklogicApiClient", search_query: Optional[str] = None): 130 """ 131 :param uri: The URI of the document to retrieve from MarkLogic. 132 :param api_client: An instance of the API client object to handle communication with the MarkLogic server. 133 :param search_query: Optionally, a search string which should be highlighted if it appears in the document body. 134 135 :raises DocumentNotFoundError: The document does not exist within MarkLogic 136 """ 137 self.uri: DocumentURIString = uri 138 self.api_client: MarklogicApiClient = api_client 139 if not self.document_exists(): 140 raise DocumentNotFoundError(f"Document {self.uri} does not exist") 141 142 self.body: DocumentBody = DocumentBody( 143 xml_bytestring=self.api_client.get_judgment_xml_bytestring( 144 self.uri, 145 show_unpublished=True, 146 search_query=search_query, 147 ), 148 ) 149 """ `Document.body` represents the body of the document itself, without any information such as version tracking or properties. """ 150 151 self._initialise_identifiers()
Parameters
- uri: The URI of the document to retrieve from MarkLogic.
- api_client: An instance of the API client object to handle communication with the MarkLogic server.
- search_query: Optionally, a search string which should be highlighted if it appears in the document body.
Raises
- DocumentNotFoundError: The document does not exist within MarkLogic
A list of tuples in the form:
(
attribute_name,
passing_condition,
error_message,
)
describing attributes which should be checked in order for a document to be considered valid.
Individual document classes should extend this list where necessary to validate document type-specific attributes.
Document.body represents the body of the document itself, without any information such as version tracking or properties.
157 def document_exists(self) -> bool: 158 """Helper method to verify the existence of a document within MarkLogic. 159 160 :return: `True` if the document exists, `False` otherwise.""" 161 return self.api_client.document_exists(self.uri)
Helper method to verify the existence of a document within MarkLogic.
Returns
`True` if the document exists, `False` otherwise.
163 def docx_exists(self) -> bool: 164 """There is a docx in S3 private bucket for this Document""" 165 return check_docx_exists(self.uri)
There is a docx in S3 private bucket for this Document
173 @property 174 def best_human_identifier(self) -> Optional[Identifier]: 175 """Return the preferred identifier for the document, providing that it is considered human readable.""" 176 preferred_identifier = self.identifiers.preferred() 177 if preferred_identifier and preferred_identifier.schema.human_readable: 178 return preferred_identifier 179 return None
Return the preferred identifier for the document, providing that it is considered human readable.
181 @property 182 def public_uri(self) -> str: 183 """ 184 :return: The absolute, public URI at which a copy of this document can be found 185 """ 186 return f"https://caselaw.nationalarchives.gov.uk/{self.slug}"
Returns
The absolute, public URI at which a copy of this document can be found
188 @cached_property 189 def slug(self) -> str: 190 """ 191 :return: The best public-facing URL for the judgment, which is the slug 192 of the most-preferred identifier, which should either be an NCN or fclid. 193 """ 194 preferred_identifier = self.identifiers.preferred() 195 if preferred_identifier: 196 return preferred_identifier.url_slug 197 msg = f"No preferred identifier exists for {self.uri}" 198 raise RuntimeError(msg)
Returns
The best public-facing URL for the judgment, which is the slug of the most-preferred identifier, which should either be an NCN or fclid.
228 @property 229 def docx_url(self) -> str: 230 """This generates a signed link to the unpublished S3 bucket and should not be used in public contexts.""" 231 return generate_docx_url(self.uri)
This generates a signed link to the unpublished S3 bucket and should not be used in public contexts.
233 @property 234 def pdf_url(self) -> str: 235 """This generates a signed link to the unpublished S3 bucket and should not be used in public contexts.""" 236 return generate_pdf_url(self.uri)
This generates a signed link to the unpublished S3 bucket and should not be used in public contexts.
242 @cached_property 243 def versions(self) -> list[VersionsDict]: 244 versions_response = self.api_client.list_judgment_versions(self.uri) 245 246 try: 247 decoded_versions = decoder.MultipartDecoder.from_response(versions_response) 248 return render_versions(decoded_versions.parts) 249 except AttributeError: 250 return []
252 @cached_property 253 def versions_as_documents(self) -> list["Document"]: 254 """ 255 Returns a list of `Document` subclasses corresponding to the versions of the document. The first entry is: 256 * the most recent 257 * the highest numbered 258 259 Note that this is only valid on the managed document -- a `DLS-DOCUMENTVERSION` error will occur if the document 260 this is called on is itself a version. 261 """ 262 if self.is_version: 263 raise NotSupportedOnVersion( 264 f"Cannot get versions of a version for {self.uri}", 265 ) 266 docs = [] 267 for version in self.versions: 268 doc_uri = DocumentURIString(version["uri"]) 269 docs.append(self.api_client.get_document_by_uri(doc_uri)) 270 return docs
Returns a list of Document subclasses corresponding to the versions of the document. The first entry is:
- the most recent
- the highest numbered
Note that this is only valid on the managed document -- a DLS-DOCUMENTVERSION error will occur if the document this is called on is itself a version.
272 @cached_property 273 def version_number(self) -> int: 274 """ 275 Note that the highest number is the most recent version. 276 Raises an exception if it is not a version (e.g. /2022/eat/1 is not a version) 277 """ 278 version = extract_version(self.uri) 279 if version == 0: 280 raise OnlySupportedOnVersion( 281 f"Version number requested for {self.uri} which is not a version", 282 ) 283 return version
Note that the highest number is the most recent version. Raises an exception if it is not a version (e.g. /2022/eat/1 is not a version)
285 @cached_property 286 def is_version(self) -> bool: 287 "Is this document a potentially historic version of a document, or is it the main document itself?" 288 return extract_version(self.uri) != 0
Is this document a potentially historic version of a document, or is it the main document itself?
290 @cached_property 291 def is_failure(self) -> bool: 292 """ 293 Is this document in a 'failure' state from which no recovery is possible? This is considered to be the case if: 294 295 - The document entirely failed to parse 296 297 :return: `True` if this document is in a 'failure' state, otherwise `False` 298 """ 299 return self.body.failed_to_parse
Is this document in a 'failure' state from which no recovery is possible? This is considered to be the case if:
- The document entirely failed to parse
Returns
`True` if this document is in a 'failure' state, otherwise `False`
324 @cached_property 325 def first_published_datetime(self) -> Optional[datetime.datetime]: 326 """ 327 Return the database value for the date and time this document was first published. 328 329 :return: The datetime value in the database for "first published". 330 """ 331 return self.api_client.get_datetime_property(self.uri, "first_published_datetime")
Return the database value for the date and time this document was first published.
Returns
The datetime value in the database for "first published".
333 @cached_property 334 def first_published_datetime_display(self) -> Optional[datetime.datetime]: 335 """ 336 Return the display value for the date and time this document was first published. 337 338 A value of 1970-01-01 00:00 indicates that the document has been published previously, but the exact date and time is unknown. In this case, return `None`. This can be used alongside `has_ever_been_published` to indicate an "unknown" state. 339 340 :return: The datetime value to be displayed to end users for "first published". 341 """ 342 343 if self.first_published_datetime == datetime.datetime(1970, 1, 1, 0, 0, tzinfo=datetime.timezone.utc): 344 return None 345 346 return self.first_published_datetime
Return the display value for the date and time this document was first published.
A value of 1970-01-01 00:00 indicates that the document has been published previously, but the exact date and time is unknown. In this case, return None. This can be used alongside has_ever_been_published to indicate an "unknown" state.
Returns
The datetime value to be displayed to end users for "first published".
348 @cached_property 349 def has_ever_been_published(self) -> bool: 350 """ 351 Do we consider this document to have ever been published? 352 353 This is `True` if either the document is currently published, or if `first_published_datetime` has any value (including the sentinel value). 354 355 :return: A boolean indicating if the document has ever been published. 356 """ 357 return self.is_published or self.first_published_datetime is not None
Do we consider this document to have ever been published?
This is True if either the document is currently published, or if first_published_datetime has any value (including the sentinel value).
Returns
A boolean indicating if the document has ever been published.
359 @cached_property 360 def validation_failure_messages(self) -> list[str]: 361 exception_list = [] 362 for function_name, pass_value, message in self.attributes_to_validate: 363 if getattr(self, function_name) != pass_value: 364 exception_list.append(message.format(document_noun=self.document_noun)) 365 return sorted(exception_list)
377 @cached_property 378 def has_unique_content_hash(self) -> bool: 379 """Check if the content hash of this document is unique compared to all other documents in MarkLogic.""" 380 return self.api_client.has_unique_content_hash(self.uri)
Check if the content hash of this document is unique compared to all other documents in MarkLogic.
382 @cached_property 383 def has_only_clean_assets(self) -> bool: 384 """False if any non-tar.gz assets associated with this document have not been cleaned.""" 385 return True # TODO: Remove this once we have enabled the asset cleaning pipeline. 386 return are_unpublished_assets_clean(self.uri)
False if any non-tar.gz assets associated with this document have not been cleaned.
405 def force_enrich(self) -> None: 406 """ 407 Request enrichment of the document, but do no checks 408 """ 409 now = datetime.datetime.now(datetime.timezone.utc) 410 self.api_client.set_property( 411 self.uri, 412 "last_sent_to_enrichment", 413 now.isoformat(), 414 ) 415 416 if not self.can_enrich: 417 msg = f"{self.uri} cannot be enriched" 418 raise CannotEnrichUnenrichableDocument(msg) 419 420 announce_document_event( 421 uri=self.uri, 422 status="enrich", 423 enrich=True, 424 )
Request enrichment of the document, but do no checks
426 def enrich(self, even_if_recent: bool = False, accept_failures: bool = False) -> bool: 427 """ 428 Request enrichment of a document, if it's sensible to do so. 429 """ 430 if not (even_if_recent) and self.enriched_recently: 431 print("Enrichment not requested as document was enriched recently") 432 return False 433 434 print("Enrichment requested") 435 436 try: 437 self.force_enrich() 438 except CannotEnrichUnenrichableDocument as e: 439 if not accept_failures: 440 raise e 441 return False 442 443 return True
Request enrichment of a document, if it's sensible to do so.
445 @cached_property 446 def enriched_recently(self) -> bool: 447 """ 448 Has this document been enriched recently? 449 """ 450 451 last_enrichment = self.body.enrichment_datetime 452 if not last_enrichment: 453 return False 454 455 now = datetime.datetime.now(tz=datetime.timezone.utc) 456 457 return now - last_enrichment < MINIMUM_ENRICHMENT_TIME
Has this document been enriched recently?
459 @cached_property 460 def validates_against_schema(self) -> bool: 461 """ 462 Does the document validate against the most recent schema? 463 """ 464 return self.api_client.validate_document(self.uri)
Does the document validate against the most recent schema?
466 def assign_fclid_if_missing(self) -> Optional[FindCaseLawIdentifier]: 467 """If the document does not have an FCLID already, mint a new one and save it.""" 468 if len(self.identifiers.of_type(FindCaseLawIdentifier)) == 0: 469 document_fclid = FindCaseLawIdentifierSchema.mint(self.api_client) 470 self.identifiers.add(document_fclid) 471 self.save_identifiers() 472 return document_fclid 473 474 return None
If the document does not have an FCLID already, mint a new one and save it.
476 def publish(self) -> None: 477 """ 478 Assuming that a document passes pre-publish checks, perform all necessary operations to put it into a published state. 479 480 :raises CannotPublishUnpublishableDocument: This document has not passed the checks in `is_publishable`, and as 481 such cannot be published. 482 """ 483 if not self.is_publishable: 484 raise CannotPublishUnpublishableDocument 485 486 ## Make sure the document has an FCLID 487 self.assign_fclid_if_missing() 488 489 ## Copy the document assets into the appropriate place in S3 490 publish_documents(self.uri) 491 492 ## Set the fact the document is published 493 self.api_client.set_published(self.uri, True) 494 495 ## If necessary, set the first published date 496 if not self.first_published_datetime: 497 self.api_client.set_datetime_property( 498 self.uri, "first_published_datetime", datetime.datetime.now(datetime.timezone.utc) 499 ) 500 501 ## Announce the publication on the event bus 502 announce_document_event( 503 uri=self.uri, 504 status="publish", 505 ) 506 507 ## Send the document off for enrichment, but accept if we can't for any reason 508 self.enrich(accept_failures=True)
Assuming that a document passes pre-publish checks, perform all necessary operations to put it into a published state.
Raises
- CannotPublishUnpublishableDocument: This document has not passed the checks in `is_publishable`, and as such cannot be published.
525 @cached_property 526 def safe_to_delete(self) -> bool: 527 """ 528 Determines if a document is in a state where it's safe to be deleted, eg not currently publicly available. 529 530 :return: If the document is safe to be deleted 531 """ 532 533 return not self.is_published
Determines if a document is in a state where it's safe to be deleted, eg not currently publicly available.
Returns
If the document is safe to be deleted
535 def delete(self) -> None: 536 """ 537 Deletes this document from MarkLogic and any resources from AWS. 538 """ 539 540 if self.safe_to_delete: 541 self.api_client.delete_judgment(self.uri) 542 delete_documents_from_private_bucket(self.uri) 543 else: 544 raise DocumentNotSafeForDeletion
Deletes this document from MarkLogic and any resources from AWS.
549 def force_reparse(self) -> None: 550 "Send an SNS notification that triggers reparsing, also sending all editor-modifiable metadata and URI" 551 552 now = datetime.datetime.now(datetime.timezone.utc) 553 self.api_client.set_property(self.uri, "last_sent_to_parser", now.isoformat()) 554 555 checked_date: Optional[str] = ( 556 self.body.document_date_as_date.isoformat() 557 if self.body.document_date_as_date and self.body.document_date_as_date > datetime.date(1001, 1, 1) 558 else None 559 ) 560 561 # the keys of parser_instructions should exactly match the parser output 562 # in the *-metadata.json files by the parser. Whilst typically empty 563 # values are "" from the API, we should pass None instead in this case. 564 565 parser_instructions: ParserInstructionsDict = { 566 "metadata": { 567 "name": self.body.name or None, 568 "cite": None, 569 "court": self.body.court or None, 570 "date": checked_date, 571 "uri": self.uri, 572 } 573 } 574 575 if self._default_reparse_document_type: 576 parser_instructions["documentType"] = self._default_reparse_document_type 577 578 ## TODO: Remove this hack around the fact that NCNs are assumed to be present for all documents' metadata, but actually different document classes may have different metadata 579 if hasattr(self, "neutral_citation"): 580 parser_instructions["metadata"]["cite"] = self.neutral_citation 581 582 request_parse( 583 uri=self.uri, 584 reference=self.consignment_reference, 585 parser_instructions=parser_instructions, 586 )
Send an SNS notification that triggers reparsing, also sending all editor-modifiable metadata and URI
588 def reparse(self) -> bool: 589 # note that we set 'last_sent_to_parser' even if we can't send it to the parser 590 # it means 'last tried to reparse' much more consistently. 591 now = datetime.datetime.now(datetime.timezone.utc) 592 self.api_client.set_property(self.uri, "last_sent_to_parser", now.isoformat()) 593 if self.can_reparse: 594 self.force_reparse() 595 return True 596 return False
598 @cached_property 599 def can_reparse(self) -> bool: 600 """ 601 Is it sensible to reparse this document? 602 """ 603 return self.docx_exists() and not self.body.has_external_data
Is it sensible to reparse this document?
605 @cached_property 606 def can_enrich(self) -> bool: 607 """ 608 Is it possible to enrich this document? 609 """ 610 return self.body.has_content and not self.body.has_external_data
Is it possible to enrich this document?
615 def save_identifiers(self) -> None: 616 """Validate the identifiers, and if the validation passes save them to MarkLogic""" 617 validations = self.validate_identifiers() 618 if validations.success is True: 619 self.api_client.set_property_as_node(self.uri, "identifiers", self.identifiers.as_etree) 620 else: 621 raise IdentifierValidationException( 622 "Unable to save identifiers; validation constraints not met: " + ", ".join(validations.messages) 623 )
Validate the identifiers, and if the validation passes save them to MarkLogic
632 def linked_document_resolutions(self, namespaces: list[str], only_published: bool = True) -> IdentifierResolutions: 633 """Get document resolutions which share the same neutral citation as this document.""" 634 if not hasattr(self, "neutral_citation") or not self.neutral_citation: 635 return IdentifierResolutions([]) 636 637 resolutions = self.api_client.resolve_from_identifier_value(self.neutral_citation) 638 if only_published: 639 resolutions = resolutions.published() 640 641 # only documents which aren't this one and have a right namespace 642 return IdentifierResolutions( 643 [ 644 resolution 645 for resolution in resolutions 646 if resolution.document_uri != self.uri.as_marklogic() and resolution.identifier_namespace in namespaces 647 ] 648 )
Get document resolutions which share the same neutral citation as this document.
650 def linked_documents(self, namespaces: list[str], only_published: bool = True) -> list["Document"]: 651 resolutions = self.linked_document_resolutions(namespaces=namespaces, only_published=only_published) 652 return [ 653 Document(resolution.document_uri.as_document_uri(), api_client=self.api_client) 654 for resolution in resolutions 655 ]
661 def xml_with_correct_frbr(self) -> bytes: 662 """Dynamically modify FRBR uris to reflect current storage location and FCL id""" 663 fcl_identifiers = self.identifiers.of_type(FindCaseLawIdentifier) 664 work_uri = f"https://caselaw.nationalarchives.gov.uk/id/doc/{fcl_identifiers[0].value}" 665 expression_uri = f"https://caselaw.nationalarchives.gov.uk/{self.uri.lstrip('/')}" 666 manifestation_uri = f"https://caselaw.nationalarchives.gov.uk/{self.uri.lstrip('/')}/data.xml" 667 return self.body.apply_xslt( 668 "modify_xml_live.xsl", work_uri=work_uri, expression_uri=expression_uri, manifestation_uri=manifestation_uri 669 )
Dynamically modify FRBR uris to reflect current storage location and FCL id