caselawclient.models.documents

View Source

  1import datetime
  2import os
  3import warnings
  4from functools import cached_property
  5from typing import TYPE_CHECKING, Any, Optional
  6
  7from ds_caselaw_utils import courts
  8from ds_caselaw_utils.courts import CourtNotFoundException
  9from ds_caselaw_utils.types import NeutralCitationString
 10from pydantic import TypeAdapter
 11from requests_toolbelt.multipart import decoder
 12
 13import caselawclient.models.documents.comparison as comparison
 14from caselawclient.errors import (
 15    DocumentNotFoundError,
 16    NotSupportedOnVersion,
 17    OnlySupportedOnVersion,
 18)
 19from caselawclient.identifier_resolution import IdentifierResolutions
 20from caselawclient.models.documents.versions import AnnotationDataDict
 21from caselawclient.models.identifiers import Identifier
 22from caselawclient.models.identifiers.exceptions import IdentifierValidationException
 23from caselawclient.models.identifiers.fclid import FindCaseLawIdentifier, FindCaseLawIdentifierSchema
 24from caselawclient.models.identifiers.unpacker import unpack_all_identifiers_from_etree
 25from caselawclient.models.utilities import VersionsDict, extract_version, render_versions
 26from caselawclient.models.utilities.aws import (
 27    ParserInstructionsDict,
 28    announce_document_event,
 29    are_unpublished_assets_clean,
 30    check_docx_exists,
 31    delete_documents_from_private_bucket,
 32    generate_docx_url,
 33    generate_pdf_url,
 34    publish_documents,
 35    request_parse,
 36    unpublish_documents,
 37)
 38from caselawclient.types import DocumentURIString, SuccessFailureMessageTuple
 39
 40from .body import DocumentBody
 41from .exceptions import CannotEnrichUnenrichableDocument, CannotPublishUnpublishableDocument, DocumentNotSafeForDeletion
 42from .statuses import DOCUMENT_STATUS_HOLD, DOCUMENT_STATUS_IN_PROGRESS, DOCUMENT_STATUS_NEW, DOCUMENT_STATUS_PUBLISHED
 43
 44MINIMUM_ENRICHMENT_TIME = datetime.timedelta(minutes=20)
 45
 46
 47class GatewayTimeoutGettingHTMLWithQuery(RuntimeWarning):
 48    pass
 49
 50
 51DOCUMENT_COLLECTION_URI_JUDGMENT = "judgment"
 52DOCUMENT_COLLECTION_URI_PRESS_SUMMARY = "press-summary"
 53
 54if TYPE_CHECKING:
 55    from caselawclient.Client import MarklogicApiClient
 56
 57
 58class Document:
 59    """
 60    A base class from which all other document types are extensions. This class includes the essential methods for
 61    retrieving and manipulating a document within MarkLogic.
 62    """
 63
 64    document_noun = "document"
 65    """ The noun for a single instance of this document type. """
 66
 67    document_noun_plural = "documents"
 68    """ The noun for a plural of this document type. """
 69
 70    _default_reparse_document_type: Optional[str] = None
 71    """ The default noun to pass to the parser when reparsing given the document type if known. This is used to determine how the document should be parsed and processed."""
 72
 73    type_collection_name: str
 74
 75    attributes_to_validate: list[tuple[str, bool, str]] = [
 76        (
 77            "is_failure",
 78            False,
 79            "This document failed to parse",
 80        ),
 81        (
 82            "is_parked",
 83            False,
 84            "This {document_noun} is currently parked at a temporary URI",
 85        ),
 86        (
 87            "is_held",
 88            False,
 89            "This {document_noun} is currently on hold",
 90        ),
 91        (
 92            "has_name",
 93            True,
 94            "This {document_noun} has no name",
 95        ),
 96        (
 97            "has_valid_court",
 98            True,
 99            "The court for this {document_noun} is not valid",
100        ),
101        (
102            "has_unique_content_hash",
103            True,
104            "There is another document with identical content",
105        ),
106        (
107            "has_only_clean_assets",
108            True,
109            "An uncleaned asset exists for this document",
110        ),
111    ]
112    """
113    A list of tuples in the form:
114
115    ``` python
116    (
117        attribute_name,
118        passing_condition,
119        error_message,
120    )
121    ```
122
123    describing attributes which should be checked in order for a document to be considered valid.
124
125    Individual document classes should extend this list where necessary to validate document type-specific attributes.
126    """
127
128    def __init__(self, uri: DocumentURIString, api_client: "MarklogicApiClient", search_query: Optional[str] = None):
129        """
130        :param uri: The URI of the document to retrieve from MarkLogic.
131        :param api_client: An instance of the API client object to handle communication with the MarkLogic server.
132        :param search_query: Optionally, a search string which should be highlighted if it appears in the document body.
133
134        :raises DocumentNotFoundError: The document does not exist within MarkLogic
135        """
136        self.uri: DocumentURIString = uri
137        self.api_client: MarklogicApiClient = api_client
138        if not self.document_exists():
139            raise DocumentNotFoundError(f"Document {self.uri} does not exist")
140
141        self.body: DocumentBody = DocumentBody(
142            xml_bytestring=self.api_client.get_judgment_xml_bytestring(
143                self.uri,
144                show_unpublished=True,
145                search_query=search_query,
146            ),
147        )
148        """ `Document.body` represents the body of the document itself, without any information such as version tracking or properties. """
149
150        self._initialise_identifiers()
151
152    def __repr__(self) -> str:
153        name = self.body.name or "un-named"
154        return f"<{self.document_noun} {self.uri}: {name}>"
155
156    def document_exists(self) -> bool:
157        """Helper method to verify the existence of a document within MarkLogic.
158
159        :return: `True` if the document exists, `False` otherwise."""
160        return self.api_client.document_exists(self.uri)
161
162    def docx_exists(self) -> bool:
163        """There is a docx in S3 private bucket for this Document"""
164        return check_docx_exists(self.uri)
165
166    def _initialise_identifiers(self) -> None:
167        """Load this document's identifiers from MarkLogic."""
168
169        identifiers_element_as_etree = self.api_client.get_property_as_node(self.uri, "identifiers")
170        self.identifiers = unpack_all_identifiers_from_etree(identifiers_element_as_etree)
171
172    @property
173    def best_human_identifier(self) -> Optional[Identifier]:
174        """Return the preferred identifier for the document, providing that it is considered human readable."""
175        preferred_identifier = self.identifiers.preferred()
176        if preferred_identifier and preferred_identifier.schema.human_readable:
177            return preferred_identifier
178        return None
179
180    @property
181    def public_uri(self) -> str:
182        """
183        :return: The absolute, public URI at which a copy of this document can be found
184        """
185        return f"https://caselaw.nationalarchives.gov.uk/{self.slug}"
186
187    @cached_property
188    def slug(self) -> str:
189        """
190        :return: The best public-facing URL for the judgment, which is the slug
191        of the most-preferred identifier, which should either be an NCN or fclid.
192        """
193        preferred_identifier = self.identifiers.preferred()
194        if preferred_identifier:
195            return preferred_identifier.url_slug
196        msg = f"No preferred identifier exists for {self.uri}"
197        raise RuntimeError(msg)
198
199    @cached_property
200    def is_published(self) -> bool:
201        return self.api_client.get_published(self.uri)
202
203    @cached_property
204    def is_held(self) -> bool:
205        return self.api_client.get_property(self.uri, "editor-hold") == "true"
206
207    @cached_property
208    def is_locked(self) -> bool:
209        return self.checkout_message is not None
210
211    @cached_property
212    def checkout_message(self) -> Optional[str]:
213        return self.api_client.get_judgment_checkout_status_message(self.uri)
214
215    @cached_property
216    def source_name(self) -> str:
217        return self.api_client.get_property(self.uri, "source-name")
218
219    @cached_property
220    def source_email(self) -> str:
221        return self.api_client.get_property(self.uri, "source-email")
222
223    @cached_property
224    def consignment_reference(self) -> str:
225        return self.api_client.get_property(self.uri, "transfer-consignment-reference")
226
227    @property
228    def docx_url(self) -> str:
229        """This generates a signed link to the unpublished S3 bucket and should not be used in public contexts."""
230        return generate_docx_url(self.uri)
231
232    @property
233    def pdf_url(self) -> str:
234        """This generates a signed link to the unpublished S3 bucket and should not be used in public contexts."""
235        return generate_pdf_url(self.uri)
236
237    @cached_property
238    def assigned_to(self) -> str:
239        return self.api_client.get_property(self.uri, "assigned-to")
240
241    @cached_property
242    def versions(self) -> list[VersionsDict]:
243        versions_response = self.api_client.list_judgment_versions(self.uri)
244
245        try:
246            decoded_versions = decoder.MultipartDecoder.from_response(versions_response)
247            return render_versions(decoded_versions.parts)
248        except AttributeError:
249            return []
250
251    @cached_property
252    def versions_as_documents(self) -> list["Document"]:
253        """
254        Returns a list of `Document` subclasses corresponding to the versions of the document. The first entry is:
255           * the most recent
256           * the highest numbered
257
258        Note that this is only valid on the managed document -- a `DLS-DOCUMENTVERSION` error will occur if the document
259        this is called on is itself a version.
260        """
261        if self.is_version:
262            raise NotSupportedOnVersion(
263                f"Cannot get versions of a version for {self.uri}",
264            )
265        docs = []
266        for version in self.versions:
267            doc_uri = DocumentURIString(version["uri"])
268            docs.append(self.api_client.get_document_by_uri(doc_uri))
269        return docs
270
271    @cached_property
272    def version_number(self) -> int:
273        """
274        Note that the highest number is the most recent version.
275        Raises an exception if it is not a version (e.g. /2022/eat/1 is not a version)
276        """
277        version = extract_version(self.uri)
278        if version == 0:
279            raise OnlySupportedOnVersion(
280                f"Version number requested for {self.uri} which is not a version",
281            )
282        return version
283
284    @cached_property
285    def is_version(self) -> bool:
286        "Is this document a potentially historic version of a document, or is it the main document itself?"
287        return extract_version(self.uri) != 0
288
289    @cached_property
290    def is_failure(self) -> bool:
291        """
292        Is this document in a 'failure' state from which no recovery is possible? This is considered to be the case if:
293
294        - The document entirely failed to parse
295
296        :return: `True` if this document is in a 'failure' state, otherwise `False`
297        """
298        return self.body.failed_to_parse
299
300    @cached_property
301    def is_parked(self) -> bool:
302        return "parked" in self.uri
303
304    @cached_property
305    def has_name(self) -> bool:
306        return bool(self.body.name)
307
308    @cached_property
309    def has_valid_court(self) -> bool:
310        try:
311            return bool(
312                courts.get_by_code(self.body.court_and_jurisdiction_identifier_string),
313            )
314        except CourtNotFoundException:
315            return False
316
317    @cached_property
318    def is_publishable(self) -> bool:
319        # If there are any validation failures, there will be no messages in the list.
320        # An empty list (which is falsy) therefore means the judgment can be published safely.
321        return not self.validation_failure_messages
322
323    @cached_property
324    def first_published_datetime(self) -> Optional[datetime.datetime]:
325        """
326        Return the database value for the date and time this document was first published.
327
328        :return: The datetime value in the database for "first published".
329        """
330        return self.api_client.get_datetime_property(self.uri, "first_published_datetime")
331
332    @cached_property
333    def first_published_datetime_display(self) -> Optional[datetime.datetime]:
334        """
335        Return the display value for the date and time this document was first published.
336
337        A value of 1970-01-01 00:00 indicates that the document has been published previously, but the exact date and time is unknown. In this case, return `None`. This can be used alongside `has_ever_been_published` to indicate an "unknown" state.
338
339        :return: The datetime value to be displayed to end users for "first published".
340        """
341
342        if self.first_published_datetime == datetime.datetime(1970, 1, 1, 0, 0, tzinfo=datetime.timezone.utc):
343            return None
344
345        return self.first_published_datetime
346
347    @cached_property
348    def has_ever_been_published(self) -> bool:
349        """
350        Do we consider this document to have ever been published?
351
352        This is `True` if either the document is currently published, or if `first_published_datetime` has any value (including the sentinel value).
353
354        :return: A boolean indicating if the document has ever been published.
355        """
356        return self.is_published or self.first_published_datetime is not None
357
358    @cached_property
359    def validation_failure_messages(self) -> list[str]:
360        exception_list = []
361        for function_name, pass_value, message in self.attributes_to_validate:
362            if getattr(self, function_name) != pass_value:
363                exception_list.append(message.format(document_noun=self.document_noun))
364        return sorted(exception_list)
365
366    @cached_property
367    def annotation(self) -> str:
368        return self.api_client.get_version_annotation(self.uri)
369
370    @cached_property
371    def structured_annotation(self) -> AnnotationDataDict:
372        annotation_data_dict_loader = TypeAdapter(AnnotationDataDict)
373
374        return annotation_data_dict_loader.validate_json(self.annotation)
375
376    @cached_property
377    def has_unique_content_hash(self) -> bool:
378        """Check if the content hash of this document is unique compared to all other documents in MarkLogic."""
379        return self.api_client.has_unique_content_hash(self.uri)
380
381    @cached_property
382    def has_only_clean_assets(self) -> bool:
383        """False if any non-tar.gz assets associated with this document have not been cleaned."""
384        return True  # TODO: Remove this once we have enabled the asset cleaning pipeline.
385        return are_unpublished_assets_clean(self.uri)
386
387    @cached_property
388    def version_created_datetime(self) -> datetime.datetime:
389        return self.api_client.get_version_created_datetime(self.uri)
390
391    @property
392    def status(self) -> str:
393        if self.is_published:
394            return DOCUMENT_STATUS_PUBLISHED
395
396        if self.is_held:
397            return DOCUMENT_STATUS_HOLD
398
399        if self.assigned_to:
400            return DOCUMENT_STATUS_IN_PROGRESS
401
402        return DOCUMENT_STATUS_NEW
403
404    def force_enrich(self) -> None:
405        """
406        Request enrichment of the document, but do no checks
407        """
408        now = datetime.datetime.now(datetime.timezone.utc)
409        self.api_client.set_property(
410            self.uri,
411            "last_sent_to_enrichment",
412            now.isoformat(),
413        )
414
415        if not self.can_enrich:
416            msg = f"{self.uri} cannot be enriched"
417            raise CannotEnrichUnenrichableDocument(msg)
418
419        announce_document_event(
420            uri=self.uri,
421            status="enrich",
422            enrich=True,
423        )
424
425    def enrich(self, even_if_recent: bool = False, accept_failures: bool = False) -> bool:
426        """
427        Request enrichment of a document, if it's sensible to do so.
428        """
429        if not (even_if_recent) and self.enriched_recently:
430            print("Enrichment not requested as document was enriched recently")
431            return False
432
433        print("Enrichment requested")
434
435        try:
436            self.force_enrich()
437        except CannotEnrichUnenrichableDocument as e:
438            if not accept_failures:
439                raise e
440            return False
441
442        return True
443
444    @cached_property
445    def enriched_recently(self) -> bool:
446        """
447        Has this document been enriched recently?
448        """
449
450        last_enrichment = self.body.enrichment_datetime
451        if not last_enrichment:
452            return False
453
454        now = datetime.datetime.now(tz=datetime.timezone.utc)
455
456        return now - last_enrichment < MINIMUM_ENRICHMENT_TIME
457
458    @cached_property
459    def validates_against_schema(self) -> bool:
460        """
461        Does the document validate against the most recent schema?
462        """
463        return self.api_client.validate_document(self.uri)
464
465    def assign_fclid_if_missing(self) -> Optional[FindCaseLawIdentifier]:
466        """If the document does not have an FCLID already, mint a new one and save it."""
467        if len(self.identifiers.of_type(FindCaseLawIdentifier)) == 0:
468            document_fclid = FindCaseLawIdentifierSchema.mint(self.api_client)
469            self.identifiers.add(document_fclid)
470            self.save_identifiers()
471            return document_fclid
472
473        return None
474
475    def publish(self) -> None:
476        """
477        Assuming that a document passes pre-publish checks, perform all necessary operations to put it into a published state.
478
479        :raises CannotPublishUnpublishableDocument: This document has not passed the checks in `is_publishable`, and as
480        such cannot be published.
481        """
482        if not self.is_publishable:
483            raise CannotPublishUnpublishableDocument
484
485        ## Make sure the document has an FCLID
486        self.assign_fclid_if_missing()
487
488        ## Copy the document assets into the appropriate place in S3
489        publish_documents(self.uri)
490
491        ## Set the fact the document is published
492        self.api_client.set_published(self.uri, True)
493
494        ## If necessary, set the first published date
495        if not self.first_published_datetime:
496            self.api_client.set_datetime_property(
497                self.uri, "first_published_datetime", datetime.datetime.now(datetime.timezone.utc)
498            )
499
500        ## Announce the publication on the event bus
501        announce_document_event(
502            uri=self.uri,
503            status="publish",
504        )
505
506        ## Send the document off for enrichment, but accept if we can't for any reason
507        self.enrich(accept_failures=True)
508
509    def unpublish(self) -> None:
510        self.api_client.break_checkout(self.uri)
511        unpublish_documents(self.uri)
512        self.api_client.set_published(self.uri, False)
513        announce_document_event(
514            uri=self.uri,
515            status="unpublish",
516        )
517
518    def hold(self) -> None:
519        self.api_client.set_property(self.uri, "editor-hold", "true")
520
521    def unhold(self) -> None:
522        self.api_client.set_property(self.uri, "editor-hold", "false")
523
524    @cached_property
525    def safe_to_delete(self) -> bool:
526        """
527        Determines if a document is in a state where it's safe to be deleted, eg not currently publicly available.
528
529        :return: If the document is safe to be deleted
530        """
531
532        return not self.is_published
533
534    def delete(self) -> None:
535        """
536        Deletes this document from MarkLogic and any resources from AWS.
537        """
538
539        if self.safe_to_delete:
540            self.api_client.delete_judgment(self.uri)
541            delete_documents_from_private_bucket(self.uri)
542        else:
543            raise DocumentNotSafeForDeletion
544
545    def move(self, new_citation: NeutralCitationString) -> None:
546        self.api_client.update_document_uri(self.uri, new_citation)
547
548    def force_reparse(self) -> None:
549        "Send an SNS notification that triggers reparsing, also sending all editor-modifiable metadata and URI"
550
551        now = datetime.datetime.now(datetime.timezone.utc)
552        self.api_client.set_property(self.uri, "last_sent_to_parser", now.isoformat())
553
554        checked_date: Optional[str] = (
555            self.body.document_date_as_date.isoformat()
556            if self.body.document_date_as_date and self.body.document_date_as_date > datetime.date(1001, 1, 1)
557            else None
558        )
559
560        # the keys of parser_instructions should exactly match the parser output
561        # in the *-metadata.json files by the parser. Whilst typically empty
562        # values are "" from the API, we should pass None instead in this case.
563
564        parser_instructions: ParserInstructionsDict = {
565            "metadata": {
566                "name": self.body.name or None,
567                "cite": None,
568                "court": self.body.court or None,
569                "date": checked_date,
570                "uri": self.uri,
571            }
572        }
573
574        if self._default_reparse_document_type:
575            parser_instructions["documentType"] = self._default_reparse_document_type
576
577        ## TODO: Remove this hack around the fact that NCNs are assumed to be present for all documents' metadata, but actually different document classes may have different metadata
578        if hasattr(self, "neutral_citation"):
579            parser_instructions["metadata"]["cite"] = self.neutral_citation
580
581        request_parse(
582            uri=self.uri,
583            reference=self.consignment_reference,
584            parser_instructions=parser_instructions,
585        )
586
587    def reparse(self) -> bool:
588        # note that we set 'last_sent_to_parser' even if we can't send it to the parser
589        # it means 'last tried to reparse' much more consistently.
590        now = datetime.datetime.now(datetime.timezone.utc)
591        self.api_client.set_property(self.uri, "last_sent_to_parser", now.isoformat())
592        if self.can_reparse:
593            self.force_reparse()
594            return True
595        return False
596
597    @cached_property
598    def can_reparse(self) -> bool:
599        """
600        Is it sensible to reparse this document?
601        """
602        return self.docx_exists() and not self.body.has_external_data
603
604    @cached_property
605    def can_enrich(self) -> bool:
606        """
607        Is it possible to enrich this document?
608        """
609        return self.body.has_content and not self.body.has_external_data
610
611    def validate_identifiers(self) -> SuccessFailureMessageTuple:
612        return self.identifiers.perform_all_validations(document_type=type(self), api_client=self.api_client)
613
614    def save_identifiers(self) -> None:
615        """Validate the identifiers, and if the validation passes save them to MarkLogic"""
616        validations = self.validate_identifiers()
617        if validations.success is True:
618            self.api_client.set_property_as_node(self.uri, "identifiers", self.identifiers.as_etree)
619        else:
620            raise IdentifierValidationException(
621                "Unable to save identifiers; validation constraints not met: " + ", ".join(validations.messages)
622            )
623
624    def __getattr__(self, name: str) -> Any:
625        warnings.warn(f"{name} no longer exists on Document, using Document.body instead", DeprecationWarning)
626        try:
627            return getattr(self.body, name)
628        except Exception:
629            raise AttributeError(f"Neither 'Document' nor 'DocumentBody' objects have an attribute '{name}'")
630
631    def linked_document_resolutions(self, namespaces: list[str], only_published: bool = True) -> IdentifierResolutions:
632        """Get document resolutions which share the same neutral citation as this document."""
633        if not hasattr(self, "neutral_citation") or not self.neutral_citation:
634            return IdentifierResolutions([])
635
636        resolutions = self.api_client.resolve_from_identifier_value(self.neutral_citation)
637        if only_published:
638            resolutions = resolutions.published()
639
640        # only documents which aren't this one and have a right namespace
641        return IdentifierResolutions(
642            [
643                resolution
644                for resolution in resolutions
645                if resolution.document_uri != self.uri.as_marklogic() and resolution.identifier_namespace in namespaces
646            ]
647        )
648
649    def linked_documents(self, namespaces: list[str], only_published: bool = True) -> list["Document"]:
650        resolutions = self.linked_document_resolutions(namespaces=namespaces, only_published=only_published)
651        return [
652            Document(resolution.document_uri.as_document_uri(), api_client=self.api_client)
653            for resolution in resolutions
654        ]
655
656    def content_as_html(self) -> str | None:
657        xlst_image_location = os.getenv("XSLT_IMAGE_LOCATION", "")
658        return self.body.content_html(f"{xlst_image_location}/{self.uri}")
659
660    def xml_with_correct_frbr(self) -> bytes:
661        """Dynamically modify FRBR uris to reflect current storage location and FCL id"""
662        fcl_identifiers = self.identifiers.of_type(FindCaseLawIdentifier)
663        work_uri = f"https://caselaw.nationalarchives.gov.uk/id/doc/{fcl_identifiers[0].value}"
664        expression_uri = f"https://caselaw.nationalarchives.gov.uk/{self.uri.lstrip('/')}"
665        manifestation_uri = f"https://caselaw.nationalarchives.gov.uk/{self.uri.lstrip('/')}/data.xml"
666        return self.body.apply_xslt(
667            "modify_xml_live.xsl", work_uri=work_uri, expression_uri=expression_uri, manifestation_uri=manifestation_uri
668        )
669
670    def compare_to(self, that_doc: "Document") -> comparison.Comparison:
671        return comparison.Comparison(self, that_doc)

MINIMUM_ENRICHMENT_TIME = datetime.timedelta(seconds=1200)

class GatewayTimeoutGettingHTMLWithQuery(builtins.RuntimeWarning): View Source

48class GatewayTimeoutGettingHTMLWithQuery(RuntimeWarning):
49    pass

Base class for warnings about dubious runtime behavior.

DOCUMENT_COLLECTION_URI_JUDGMENT = 'judgment'

DOCUMENT_COLLECTION_URI_PRESS_SUMMARY = 'press-summary'

class Document: View Source

 59class Document:
 60    """
 61    A base class from which all other document types are extensions. This class includes the essential methods for
 62    retrieving and manipulating a document within MarkLogic.
 63    """
 64
 65    document_noun = "document"
 66    """ The noun for a single instance of this document type. """
 67
 68    document_noun_plural = "documents"
 69    """ The noun for a plural of this document type. """
 70
 71    _default_reparse_document_type: Optional[str] = None
 72    """ The default noun to pass to the parser when reparsing given the document type if known. This is used to determine how the document should be parsed and processed."""
 73
 74    type_collection_name: str
 75
 76    attributes_to_validate: list[tuple[str, bool, str]] = [
 77        (
 78            "is_failure",
 79            False,
 80            "This document failed to parse",
 81        ),
 82        (
 83            "is_parked",
 84            False,
 85            "This {document_noun} is currently parked at a temporary URI",
 86        ),
 87        (
 88            "is_held",
 89            False,
 90            "This {document_noun} is currently on hold",
 91        ),
 92        (
 93            "has_name",
 94            True,
 95            "This {document_noun} has no name",
 96        ),
 97        (
 98            "has_valid_court",
 99            True,
100            "The court for this {document_noun} is not valid",
101        ),
102        (
103            "has_unique_content_hash",
104            True,
105            "There is another document with identical content",
106        ),
107        (
108            "has_only_clean_assets",
109            True,
110            "An uncleaned asset exists for this document",
111        ),
112    ]
113    """
114    A list of tuples in the form:
115
116    ``` python
117    (
118        attribute_name,
119        passing_condition,
120        error_message,
121    )
122    ```
123
124    describing attributes which should be checked in order for a document to be considered valid.
125
126    Individual document classes should extend this list where necessary to validate document type-specific attributes.
127    """
128
129    def __init__(self, uri: DocumentURIString, api_client: "MarklogicApiClient", search_query: Optional[str] = None):
130        """
131        :param uri: The URI of the document to retrieve from MarkLogic.
132        :param api_client: An instance of the API client object to handle communication with the MarkLogic server.
133        :param search_query: Optionally, a search string which should be highlighted if it appears in the document body.
134
135        :raises DocumentNotFoundError: The document does not exist within MarkLogic
136        """
137        self.uri: DocumentURIString = uri
138        self.api_client: MarklogicApiClient = api_client
139        if not self.document_exists():
140            raise DocumentNotFoundError(f"Document {self.uri} does not exist")
141
142        self.body: DocumentBody = DocumentBody(
143            xml_bytestring=self.api_client.get_judgment_xml_bytestring(
144                self.uri,
145                show_unpublished=True,
146                search_query=search_query,
147            ),
148        )
149        """ `Document.body` represents the body of the document itself, without any information such as version tracking or properties. """
150
151        self._initialise_identifiers()
152
153    def __repr__(self) -> str:
154        name = self.body.name or "un-named"
155        return f"<{self.document_noun} {self.uri}: {name}>"
156
157    def document_exists(self) -> bool:
158        """Helper method to verify the existence of a document within MarkLogic.
159
160        :return: `True` if the document exists, `False` otherwise."""
161        return self.api_client.document_exists(self.uri)
162
163    def docx_exists(self) -> bool:
164        """There is a docx in S3 private bucket for this Document"""
165        return check_docx_exists(self.uri)
166
167    def _initialise_identifiers(self) -> None:
168        """Load this document's identifiers from MarkLogic."""
169
170        identifiers_element_as_etree = self.api_client.get_property_as_node(self.uri, "identifiers")
171        self.identifiers = unpack_all_identifiers_from_etree(identifiers_element_as_etree)
172
173    @property
174    def best_human_identifier(self) -> Optional[Identifier]:
175        """Return the preferred identifier for the document, providing that it is considered human readable."""
176        preferred_identifier = self.identifiers.preferred()
177        if preferred_identifier and preferred_identifier.schema.human_readable:
178            return preferred_identifier
179        return None
180
181    @property
182    def public_uri(self) -> str:
183        """
184        :return: The absolute, public URI at which a copy of this document can be found
185        """
186        return f"https://caselaw.nationalarchives.gov.uk/{self.slug}"
187
188    @cached_property
189    def slug(self) -> str:
190        """
191        :return: The best public-facing URL for the judgment, which is the slug
192        of the most-preferred identifier, which should either be an NCN or fclid.
193        """
194        preferred_identifier = self.identifiers.preferred()
195        if preferred_identifier:
196            return preferred_identifier.url_slug
197        msg = f"No preferred identifier exists for {self.uri}"
198        raise RuntimeError(msg)
199
200    @cached_property
201    def is_published(self) -> bool:
202        return self.api_client.get_published(self.uri)
203
204    @cached_property
205    def is_held(self) -> bool:
206        return self.api_client.get_property(self.uri, "editor-hold") == "true"
207
208    @cached_property
209    def is_locked(self) -> bool:
210        return self.checkout_message is not None
211
212    @cached_property
213    def checkout_message(self) -> Optional[str]:
214        return self.api_client.get_judgment_checkout_status_message(self.uri)
215
216    @cached_property
217    def source_name(self) -> str:
218        return self.api_client.get_property(self.uri, "source-name")
219
220    @cached_property
221    def source_email(self) -> str:
222        return self.api_client.get_property(self.uri, "source-email")
223
224    @cached_property
225    def consignment_reference(self) -> str:
226        return self.api_client.get_property(self.uri, "transfer-consignment-reference")
227
228    @property
229    def docx_url(self) -> str:
230        """This generates a signed link to the unpublished S3 bucket and should not be used in public contexts."""
231        return generate_docx_url(self.uri)
232
233    @property
234    def pdf_url(self) -> str:
235        """This generates a signed link to the unpublished S3 bucket and should not be used in public contexts."""
236        return generate_pdf_url(self.uri)
237
238    @cached_property
239    def assigned_to(self) -> str:
240        return self.api_client.get_property(self.uri, "assigned-to")
241
242    @cached_property
243    def versions(self) -> list[VersionsDict]:
244        versions_response = self.api_client.list_judgment_versions(self.uri)
245
246        try:
247            decoded_versions = decoder.MultipartDecoder.from_response(versions_response)
248            return render_versions(decoded_versions.parts)
249        except AttributeError:
250            return []
251
252    @cached_property
253    def versions_as_documents(self) -> list["Document"]:
254        """
255        Returns a list of `Document` subclasses corresponding to the versions of the document. The first entry is:
256           * the most recent
257           * the highest numbered
258
259        Note that this is only valid on the managed document -- a `DLS-DOCUMENTVERSION` error will occur if the document
260        this is called on is itself a version.
261        """
262        if self.is_version:
263            raise NotSupportedOnVersion(
264                f"Cannot get versions of a version for {self.uri}",
265            )
266        docs = []
267        for version in self.versions:
268            doc_uri = DocumentURIString(version["uri"])
269            docs.append(self.api_client.get_document_by_uri(doc_uri))
270        return docs
271
272    @cached_property
273    def version_number(self) -> int:
274        """
275        Note that the highest number is the most recent version.
276        Raises an exception if it is not a version (e.g. /2022/eat/1 is not a version)
277        """
278        version = extract_version(self.uri)
279        if version == 0:
280            raise OnlySupportedOnVersion(
281                f"Version number requested for {self.uri} which is not a version",
282            )
283        return version
284
285    @cached_property
286    def is_version(self) -> bool:
287        "Is this document a potentially historic version of a document, or is it the main document itself?"
288        return extract_version(self.uri) != 0
289
290    @cached_property
291    def is_failure(self) -> bool:
292        """
293        Is this document in a 'failure' state from which no recovery is possible? This is considered to be the case if:
294
295        - The document entirely failed to parse
296
297        :return: `True` if this document is in a 'failure' state, otherwise `False`
298        """
299        return self.body.failed_to_parse
300
301    @cached_property
302    def is_parked(self) -> bool:
303        return "parked" in self.uri
304
305    @cached_property
306    def has_name(self) -> bool:
307        return bool(self.body.name)
308
309    @cached_property
310    def has_valid_court(self) -> bool:
311        try:
312            return bool(
313                courts.get_by_code(self.body.court_and_jurisdiction_identifier_string),
314            )
315        except CourtNotFoundException:
316            return False
317
318    @cached_property
319    def is_publishable(self) -> bool:
320        # If there are any validation failures, there will be no messages in the list.
321        # An empty list (which is falsy) therefore means the judgment can be published safely.
322        return not self.validation_failure_messages
323
324    @cached_property
325    def first_published_datetime(self) -> Optional[datetime.datetime]:
326        """
327        Return the database value for the date and time this document was first published.
328
329        :return: The datetime value in the database for "first published".
330        """
331        return self.api_client.get_datetime_property(self.uri, "first_published_datetime")
332
333    @cached_property
334    def first_published_datetime_display(self) -> Optional[datetime.datetime]:
335        """
336        Return the display value for the date and time this document was first published.
337
338        A value of 1970-01-01 00:00 indicates that the document has been published previously, but the exact date and time is unknown. In this case, return `None`. This can be used alongside `has_ever_been_published` to indicate an "unknown" state.
339
340        :return: The datetime value to be displayed to end users for "first published".
341        """
342
343        if self.first_published_datetime == datetime.datetime(1970, 1, 1, 0, 0, tzinfo=datetime.timezone.utc):
344            return None
345
346        return self.first_published_datetime
347
348    @cached_property
349    def has_ever_been_published(self) -> bool:
350        """
351        Do we consider this document to have ever been published?
352
353        This is `True` if either the document is currently published, or if `first_published_datetime` has any value (including the sentinel value).
354
355        :return: A boolean indicating if the document has ever been published.
356        """
357        return self.is_published or self.first_published_datetime is not None
358
359    @cached_property
360    def validation_failure_messages(self) -> list[str]:
361        exception_list = []
362        for function_name, pass_value, message in self.attributes_to_validate:
363            if getattr(self, function_name) != pass_value:
364                exception_list.append(message.format(document_noun=self.document_noun))
365        return sorted(exception_list)
366
367    @cached_property
368    def annotation(self) -> str:
369        return self.api_client.get_version_annotation(self.uri)
370
371    @cached_property
372    def structured_annotation(self) -> AnnotationDataDict:
373        annotation_data_dict_loader = TypeAdapter(AnnotationDataDict)
374
375        return annotation_data_dict_loader.validate_json(self.annotation)
376
377    @cached_property
378    def has_unique_content_hash(self) -> bool:
379        """Check if the content hash of this document is unique compared to all other documents in MarkLogic."""
380        return self.api_client.has_unique_content_hash(self.uri)
381
382    @cached_property
383    def has_only_clean_assets(self) -> bool:
384        """False if any non-tar.gz assets associated with this document have not been cleaned."""
385        return True  # TODO: Remove this once we have enabled the asset cleaning pipeline.
386        return are_unpublished_assets_clean(self.uri)
387
388    @cached_property
389    def version_created_datetime(self) -> datetime.datetime:
390        return self.api_client.get_version_created_datetime(self.uri)
391
392    @property
393    def status(self) -> str:
394        if self.is_published:
395            return DOCUMENT_STATUS_PUBLISHED
396
397        if self.is_held:
398            return DOCUMENT_STATUS_HOLD
399
400        if self.assigned_to:
401            return DOCUMENT_STATUS_IN_PROGRESS
402
403        return DOCUMENT_STATUS_NEW
404
405    def force_enrich(self) -> None:
406        """
407        Request enrichment of the document, but do no checks
408        """
409        now = datetime.datetime.now(datetime.timezone.utc)
410        self.api_client.set_property(
411            self.uri,
412            "last_sent_to_enrichment",
413            now.isoformat(),
414        )
415
416        if not self.can_enrich:
417            msg = f"{self.uri} cannot be enriched"
418            raise CannotEnrichUnenrichableDocument(msg)
419
420        announce_document_event(
421            uri=self.uri,
422            status="enrich",
423            enrich=True,
424        )
425
426    def enrich(self, even_if_recent: bool = False, accept_failures: bool = False) -> bool:
427        """
428        Request enrichment of a document, if it's sensible to do so.
429        """
430        if not (even_if_recent) and self.enriched_recently:
431            print("Enrichment not requested as document was enriched recently")
432            return False
433
434        print("Enrichment requested")
435
436        try:
437            self.force_enrich()
438        except CannotEnrichUnenrichableDocument as e:
439            if not accept_failures:
440                raise e
441            return False
442
443        return True
444
445    @cached_property
446    def enriched_recently(self) -> bool:
447        """
448        Has this document been enriched recently?
449        """
450
451        last_enrichment = self.body.enrichment_datetime
452        if not last_enrichment:
453            return False
454
455        now = datetime.datetime.now(tz=datetime.timezone.utc)
456
457        return now - last_enrichment < MINIMUM_ENRICHMENT_TIME
458
459    @cached_property
460    def validates_against_schema(self) -> bool:
461        """
462        Does the document validate against the most recent schema?
463        """
464        return self.api_client.validate_document(self.uri)
465
466    def assign_fclid_if_missing(self) -> Optional[FindCaseLawIdentifier]:
467        """If the document does not have an FCLID already, mint a new one and save it."""
468        if len(self.identifiers.of_type(FindCaseLawIdentifier)) == 0:
469            document_fclid = FindCaseLawIdentifierSchema.mint(self.api_client)
470            self.identifiers.add(document_fclid)
471            self.save_identifiers()
472            return document_fclid
473
474        return None
475
476    def publish(self) -> None:
477        """
478        Assuming that a document passes pre-publish checks, perform all necessary operations to put it into a published state.
479
480        :raises CannotPublishUnpublishableDocument: This document has not passed the checks in `is_publishable`, and as
481        such cannot be published.
482        """
483        if not self.is_publishable:
484            raise CannotPublishUnpublishableDocument
485
486        ## Make sure the document has an FCLID
487        self.assign_fclid_if_missing()
488
489        ## Copy the document assets into the appropriate place in S3
490        publish_documents(self.uri)
491
492        ## Set the fact the document is published
493        self.api_client.set_published(self.uri, True)
494
495        ## If necessary, set the first published date
496        if not self.first_published_datetime:
497            self.api_client.set_datetime_property(
498                self.uri, "first_published_datetime", datetime.datetime.now(datetime.timezone.utc)
499            )
500
501        ## Announce the publication on the event bus
502        announce_document_event(
503            uri=self.uri,
504            status="publish",
505        )
506
507        ## Send the document off for enrichment, but accept if we can't for any reason
508        self.enrich(accept_failures=True)
509
510    def unpublish(self) -> None:
511        self.api_client.break_checkout(self.uri)
512        unpublish_documents(self.uri)
513        self.api_client.set_published(self.uri, False)
514        announce_document_event(
515            uri=self.uri,
516            status="unpublish",
517        )
518
519    def hold(self) -> None:
520        self.api_client.set_property(self.uri, "editor-hold", "true")
521
522    def unhold(self) -> None:
523        self.api_client.set_property(self.uri, "editor-hold", "false")
524
525    @cached_property
526    def safe_to_delete(self) -> bool:
527        """
528        Determines if a document is in a state where it's safe to be deleted, eg not currently publicly available.
529
530        :return: If the document is safe to be deleted
531        """
532
533        return not self.is_published
534
535    def delete(self) -> None:
536        """
537        Deletes this document from MarkLogic and any resources from AWS.
538        """
539
540        if self.safe_to_delete:
541            self.api_client.delete_judgment(self.uri)
542            delete_documents_from_private_bucket(self.uri)
543        else:
544            raise DocumentNotSafeForDeletion
545
546    def move(self, new_citation: NeutralCitationString) -> None:
547        self.api_client.update_document_uri(self.uri, new_citation)
548
549    def force_reparse(self) -> None:
550        "Send an SNS notification that triggers reparsing, also sending all editor-modifiable metadata and URI"
551
552        now = datetime.datetime.now(datetime.timezone.utc)
553        self.api_client.set_property(self.uri, "last_sent_to_parser", now.isoformat())
554
555        checked_date: Optional[str] = (
556            self.body.document_date_as_date.isoformat()
557            if self.body.document_date_as_date and self.body.document_date_as_date > datetime.date(1001, 1, 1)
558            else None
559        )
560
561        # the keys of parser_instructions should exactly match the parser output
562        # in the *-metadata.json files by the parser. Whilst typically empty
563        # values are "" from the API, we should pass None instead in this case.
564
565        parser_instructions: ParserInstructionsDict = {
566            "metadata": {
567                "name": self.body.name or None,
568                "cite": None,
569                "court": self.body.court or None,
570                "date": checked_date,
571                "uri": self.uri,
572            }
573        }
574
575        if self._default_reparse_document_type:
576            parser_instructions["documentType"] = self._default_reparse_document_type
577
578        ## TODO: Remove this hack around the fact that NCNs are assumed to be present for all documents' metadata, but actually different document classes may have different metadata
579        if hasattr(self, "neutral_citation"):
580            parser_instructions["metadata"]["cite"] = self.neutral_citation
581
582        request_parse(
583            uri=self.uri,
584            reference=self.consignment_reference,
585            parser_instructions=parser_instructions,
586        )
587
588    def reparse(self) -> bool:
589        # note that we set 'last_sent_to_parser' even if we can't send it to the parser
590        # it means 'last tried to reparse' much more consistently.
591        now = datetime.datetime.now(datetime.timezone.utc)
592        self.api_client.set_property(self.uri, "last_sent_to_parser", now.isoformat())
593        if self.can_reparse:
594            self.force_reparse()
595            return True
596        return False
597
598    @cached_property
599    def can_reparse(self) -> bool:
600        """
601        Is it sensible to reparse this document?
602        """
603        return self.docx_exists() and not self.body.has_external_data
604
605    @cached_property
606    def can_enrich(self) -> bool:
607        """
608        Is it possible to enrich this document?
609        """
610        return self.body.has_content and not self.body.has_external_data
611
612    def validate_identifiers(self) -> SuccessFailureMessageTuple:
613        return self.identifiers.perform_all_validations(document_type=type(self), api_client=self.api_client)
614
615    def save_identifiers(self) -> None:
616        """Validate the identifiers, and if the validation passes save them to MarkLogic"""
617        validations = self.validate_identifiers()
618        if validations.success is True:
619            self.api_client.set_property_as_node(self.uri, "identifiers", self.identifiers.as_etree)
620        else:
621            raise IdentifierValidationException(
622                "Unable to save identifiers; validation constraints not met: " + ", ".join(validations.messages)
623            )
624
625    def __getattr__(self, name: str) -> Any:
626        warnings.warn(f"{name} no longer exists on Document, using Document.body instead", DeprecationWarning)
627        try:
628            return getattr(self.body, name)
629        except Exception:
630            raise AttributeError(f"Neither 'Document' nor 'DocumentBody' objects have an attribute '{name}'")
631
632    def linked_document_resolutions(self, namespaces: list[str], only_published: bool = True) -> IdentifierResolutions:
633        """Get document resolutions which share the same neutral citation as this document."""
634        if not hasattr(self, "neutral_citation") or not self.neutral_citation:
635            return IdentifierResolutions([])
636
637        resolutions = self.api_client.resolve_from_identifier_value(self.neutral_citation)
638        if only_published:
639            resolutions = resolutions.published()
640
641        # only documents which aren't this one and have a right namespace
642        return IdentifierResolutions(
643            [
644                resolution
645                for resolution in resolutions
646                if resolution.document_uri != self.uri.as_marklogic() and resolution.identifier_namespace in namespaces
647            ]
648        )
649
650    def linked_documents(self, namespaces: list[str], only_published: bool = True) -> list["Document"]:
651        resolutions = self.linked_document_resolutions(namespaces=namespaces, only_published=only_published)
652        return [
653            Document(resolution.document_uri.as_document_uri(), api_client=self.api_client)
654            for resolution in resolutions
655        ]
656
657    def content_as_html(self) -> str | None:
658        xlst_image_location = os.getenv("XSLT_IMAGE_LOCATION", "")
659        return self.body.content_html(f"{xlst_image_location}/{self.uri}")
660
661    def xml_with_correct_frbr(self) -> bytes:
662        """Dynamically modify FRBR uris to reflect current storage location and FCL id"""
663        fcl_identifiers = self.identifiers.of_type(FindCaseLawIdentifier)
664        work_uri = f"https://caselaw.nationalarchives.gov.uk/id/doc/{fcl_identifiers[0].value}"
665        expression_uri = f"https://caselaw.nationalarchives.gov.uk/{self.uri.lstrip('/')}"
666        manifestation_uri = f"https://caselaw.nationalarchives.gov.uk/{self.uri.lstrip('/')}/data.xml"
667        return self.body.apply_xslt(
668            "modify_xml_live.xsl", work_uri=work_uri, expression_uri=expression_uri, manifestation_uri=manifestation_uri
669        )
670
671    def compare_to(self, that_doc: "Document") -> comparison.Comparison:
672        return comparison.Comparison(self, that_doc)

A base class from which all other document types are extensions. This class includes the essential methods for retrieving and manipulating a document within MarkLogic.

Document( uri: caselawclient.types.DocumentURIString, api_client: caselawclient.Client.MarklogicApiClient, search_query: Optional[str] = None) View Source

129    def __init__(self, uri: DocumentURIString, api_client: "MarklogicApiClient", search_query: Optional[str] = None):
130        """
131        :param uri: The URI of the document to retrieve from MarkLogic.
132        :param api_client: An instance of the API client object to handle communication with the MarkLogic server.
133        :param search_query: Optionally, a search string which should be highlighted if it appears in the document body.
134
135        :raises DocumentNotFoundError: The document does not exist within MarkLogic
136        """
137        self.uri: DocumentURIString = uri
138        self.api_client: MarklogicApiClient = api_client
139        if not self.document_exists():
140            raise DocumentNotFoundError(f"Document {self.uri} does not exist")
141
142        self.body: DocumentBody = DocumentBody(
143            xml_bytestring=self.api_client.get_judgment_xml_bytestring(
144                self.uri,
145                show_unpublished=True,
146                search_query=search_query,
147            ),
148        )
149        """ `Document.body` represents the body of the document itself, without any information such as version tracking or properties. """
150
151        self._initialise_identifiers()

Parameters

uri: The URI of the document to retrieve from MarkLogic.
api_client: An instance of the API client object to handle communication with the MarkLogic server.
search_query: Optionally, a search string which should be highlighted if it appears in the document body.

Raises

DocumentNotFoundError: The document does not exist within MarkLogic

document_noun = 'document'

The noun for a single instance of this document type.

document_noun_plural = 'documents'

The noun for a plural of this document type.

type_collection_name: str

attributes_to_validate: list[tuple[str, bool, str]] = [('is_failure', False, 'This document failed to parse'), ('is_parked', False, 'This {document_noun} is currently parked at a temporary URI'), ('is_held', False, 'This {document_noun} is currently on hold'), ('has_name', True, 'This {document_noun} has no name'), ('has_valid_court', True, 'The court for this {document_noun} is not valid'), ('has_unique_content_hash', True, 'There is another document with identical content'), ('has_only_clean_assets', True, 'An uncleaned asset exists for this document')]

A list of tuples in the form:

(
    attribute_name,
    passing_condition,
    error_message,
)

describing attributes which should be checked in order for a document to be considered valid.

Individual document classes should extend this list where necessary to validate document type-specific attributes.

uri: caselawclient.types.DocumentURIString

api_client: caselawclient.Client.MarklogicApiClient

body: caselawclient.models.documents.body.DocumentBody

Document.body represents the body of the document itself, without any information such as version tracking or properties.

def document_exists(self) -> bool: View Source

157    def document_exists(self) -> bool:
158        """Helper method to verify the existence of a document within MarkLogic.
159
160        :return: `True` if the document exists, `False` otherwise."""
161        return self.api_client.document_exists(self.uri)

Helper method to verify the existence of a document within MarkLogic.

Returns

True if the document exists, False otherwise.

def docx_exists(self) -> bool: View Source

163    def docx_exists(self) -> bool:
164        """There is a docx in S3 private bucket for this Document"""
165        return check_docx_exists(self.uri)

There is a docx in S3 private bucket for this Document

best_human_identifier: Optional[caselawclient.models.identifiers.Identifier] View Source

173    @property
174    def best_human_identifier(self) -> Optional[Identifier]:
175        """Return the preferred identifier for the document, providing that it is considered human readable."""
176        preferred_identifier = self.identifiers.preferred()
177        if preferred_identifier and preferred_identifier.schema.human_readable:
178            return preferred_identifier
179        return None

Return the preferred identifier for the document, providing that it is considered human readable.

public_uri: str View Source

181    @property
182    def public_uri(self) -> str:
183        """
184        :return: The absolute, public URI at which a copy of this document can be found
185        """
186        return f"https://caselaw.nationalarchives.gov.uk/{self.slug}"

Returns

The absolute, public URI at which a copy of this document can be found

slug: str View Source

188    @cached_property
189    def slug(self) -> str:
190        """
191        :return: The best public-facing URL for the judgment, which is the slug
192        of the most-preferred identifier, which should either be an NCN or fclid.
193        """
194        preferred_identifier = self.identifiers.preferred()
195        if preferred_identifier:
196            return preferred_identifier.url_slug
197        msg = f"No preferred identifier exists for {self.uri}"
198        raise RuntimeError(msg)

Returns

The best public-facing URL for the judgment, which is the slug of the most-preferred identifier, which should either be an NCN or fclid.

is_published: bool View Source

200    @cached_property
201    def is_published(self) -> bool:
202        return self.api_client.get_published(self.uri)

is_held: bool View Source

204    @cached_property
205    def is_held(self) -> bool:
206        return self.api_client.get_property(self.uri, "editor-hold") == "true"

is_locked: bool View Source

208    @cached_property
209    def is_locked(self) -> bool:
210        return self.checkout_message is not None

checkout_message: Optional[str] View Source

212    @cached_property
213    def checkout_message(self) -> Optional[str]:
214        return self.api_client.get_judgment_checkout_status_message(self.uri)

source_name: str View Source

216    @cached_property
217    def source_name(self) -> str:
218        return self.api_client.get_property(self.uri, "source-name")

source_email: str View Source

220    @cached_property
221    def source_email(self) -> str:
222        return self.api_client.get_property(self.uri, "source-email")

consignment_reference: str View Source

224    @cached_property
225    def consignment_reference(self) -> str:
226        return self.api_client.get_property(self.uri, "transfer-consignment-reference")

docx_url: str View Source

228    @property
229    def docx_url(self) -> str:
230        """This generates a signed link to the unpublished S3 bucket and should not be used in public contexts."""
231        return generate_docx_url(self.uri)

This generates a signed link to the unpublished S3 bucket and should not be used in public contexts.

pdf_url: str View Source

233    @property
234    def pdf_url(self) -> str:
235        """This generates a signed link to the unpublished S3 bucket and should not be used in public contexts."""
236        return generate_pdf_url(self.uri)

This generates a signed link to the unpublished S3 bucket and should not be used in public contexts.

assigned_to: str View Source

238    @cached_property
239    def assigned_to(self) -> str:
240        return self.api_client.get_property(self.uri, "assigned-to")

versions: list[caselawclient.models.utilities.VersionsDict] View Source

242    @cached_property
243    def versions(self) -> list[VersionsDict]:
244        versions_response = self.api_client.list_judgment_versions(self.uri)
245
246        try:
247            decoded_versions = decoder.MultipartDecoder.from_response(versions_response)
248            return render_versions(decoded_versions.parts)
249        except AttributeError:
250            return []

versions_as_documents: list[Document] View Source

252    @cached_property
253    def versions_as_documents(self) -> list["Document"]:
254        """
255        Returns a list of `Document` subclasses corresponding to the versions of the document. The first entry is:
256           * the most recent
257           * the highest numbered
258
259        Note that this is only valid on the managed document -- a `DLS-DOCUMENTVERSION` error will occur if the document
260        this is called on is itself a version.
261        """
262        if self.is_version:
263            raise NotSupportedOnVersion(
264                f"Cannot get versions of a version for {self.uri}",
265            )
266        docs = []
267        for version in self.versions:
268            doc_uri = DocumentURIString(version["uri"])
269            docs.append(self.api_client.get_document_by_uri(doc_uri))
270        return docs

Returns a list of Document subclasses corresponding to the versions of the document. The first entry is:

the most recent
the highest numbered

Note that this is only valid on the managed document -- a DLS-DOCUMENTVERSION error will occur if the document this is called on is itself a version.

version_number: int View Source

272    @cached_property
273    def version_number(self) -> int:
274        """
275        Note that the highest number is the most recent version.
276        Raises an exception if it is not a version (e.g. /2022/eat/1 is not a version)
277        """
278        version = extract_version(self.uri)
279        if version == 0:
280            raise OnlySupportedOnVersion(
281                f"Version number requested for {self.uri} which is not a version",
282            )
283        return version

Note that the highest number is the most recent version. Raises an exception if it is not a version (e.g. /2022/eat/1 is not a version)

is_version: bool View Source

285    @cached_property
286    def is_version(self) -> bool:
287        "Is this document a potentially historic version of a document, or is it the main document itself?"
288        return extract_version(self.uri) != 0

Is this document a potentially historic version of a document, or is it the main document itself?

is_failure: bool View Source

290    @cached_property
291    def is_failure(self) -> bool:
292        """
293        Is this document in a 'failure' state from which no recovery is possible? This is considered to be the case if:
294
295        - The document entirely failed to parse
296
297        :return: `True` if this document is in a 'failure' state, otherwise `False`
298        """
299        return self.body.failed_to_parse

Is this document in a 'failure' state from which no recovery is possible? This is considered to be the case if:

The document entirely failed to parse

Returns

True if this document is in a 'failure' state, otherwise False

is_parked: bool View Source

301    @cached_property
302    def is_parked(self) -> bool:
303        return "parked" in self.uri

has_name: bool View Source

305    @cached_property
306    def has_name(self) -> bool:
307        return bool(self.body.name)

has_valid_court: bool View Source

309    @cached_property
310    def has_valid_court(self) -> bool:
311        try:
312            return bool(
313                courts.get_by_code(self.body.court_and_jurisdiction_identifier_string),
314            )
315        except CourtNotFoundException:
316            return False

is_publishable: bool View Source

318    @cached_property
319    def is_publishable(self) -> bool:
320        # If there are any validation failures, there will be no messages in the list.
321        # An empty list (which is falsy) therefore means the judgment can be published safely.
322        return not self.validation_failure_messages

first_published_datetime: Optional[datetime.datetime] View Source

324    @cached_property
325    def first_published_datetime(self) -> Optional[datetime.datetime]:
326        """
327        Return the database value for the date and time this document was first published.
328
329        :return: The datetime value in the database for "first published".
330        """
331        return self.api_client.get_datetime_property(self.uri, "first_published_datetime")

Return the database value for the date and time this document was first published.

Returns

The datetime value in the database for "first published".

first_published_datetime_display: Optional[datetime.datetime] View Source

333    @cached_property
334    def first_published_datetime_display(self) -> Optional[datetime.datetime]:
335        """
336        Return the display value for the date and time this document was first published.
337
338        A value of 1970-01-01 00:00 indicates that the document has been published previously, but the exact date and time is unknown. In this case, return `None`. This can be used alongside `has_ever_been_published` to indicate an "unknown" state.
339
340        :return: The datetime value to be displayed to end users for "first published".
341        """
342
343        if self.first_published_datetime == datetime.datetime(1970, 1, 1, 0, 0, tzinfo=datetime.timezone.utc):
344            return None
345
346        return self.first_published_datetime

Return the display value for the date and time this document was first published.

A value of 1970-01-01 00:00 indicates that the document has been published previously, but the exact date and time is unknown. In this case, return None. This can be used alongside has_ever_been_published to indicate an "unknown" state.

Returns

The datetime value to be displayed to end users for "first published".

has_ever_been_published: bool View Source

348    @cached_property
349    def has_ever_been_published(self) -> bool:
350        """
351        Do we consider this document to have ever been published?
352
353        This is `True` if either the document is currently published, or if `first_published_datetime` has any value (including the sentinel value).
354
355        :return: A boolean indicating if the document has ever been published.
356        """
357        return self.is_published or self.first_published_datetime is not None

Do we consider this document to have ever been published?

This is True if either the document is currently published, or if first_published_datetime has any value (including the sentinel value).

Returns

A boolean indicating if the document has ever been published.

validation_failure_messages: list[str] View Source

359    @cached_property
360    def validation_failure_messages(self) -> list[str]:
361        exception_list = []
362        for function_name, pass_value, message in self.attributes_to_validate:
363            if getattr(self, function_name) != pass_value:
364                exception_list.append(message.format(document_noun=self.document_noun))
365        return sorted(exception_list)

annotation: str View Source

367    @cached_property
368    def annotation(self) -> str:
369        return self.api_client.get_version_annotation(self.uri)

structured_annotation: caselawclient.models.documents.versions.AnnotationDataDict View Source

371    @cached_property
372    def structured_annotation(self) -> AnnotationDataDict:
373        annotation_data_dict_loader = TypeAdapter(AnnotationDataDict)
374
375        return annotation_data_dict_loader.validate_json(self.annotation)

has_unique_content_hash: bool View Source

377    @cached_property
378    def has_unique_content_hash(self) -> bool:
379        """Check if the content hash of this document is unique compared to all other documents in MarkLogic."""
380        return self.api_client.has_unique_content_hash(self.uri)

Check if the content hash of this document is unique compared to all other documents in MarkLogic.

has_only_clean_assets: bool View Source

382    @cached_property
383    def has_only_clean_assets(self) -> bool:
384        """False if any non-tar.gz assets associated with this document have not been cleaned."""
385        return True  # TODO: Remove this once we have enabled the asset cleaning pipeline.
386        return are_unpublished_assets_clean(self.uri)

False if any non-tar.gz assets associated with this document have not been cleaned.

version_created_datetime: datetime.datetime View Source

388    @cached_property
389    def version_created_datetime(self) -> datetime.datetime:
390        return self.api_client.get_version_created_datetime(self.uri)

status: str View Source

392    @property
393    def status(self) -> str:
394        if self.is_published:
395            return DOCUMENT_STATUS_PUBLISHED
396
397        if self.is_held:
398            return DOCUMENT_STATUS_HOLD
399
400        if self.assigned_to:
401            return DOCUMENT_STATUS_IN_PROGRESS
402
403        return DOCUMENT_STATUS_NEW

def force_enrich(self) -> None: View Source

405    def force_enrich(self) -> None:
406        """
407        Request enrichment of the document, but do no checks
408        """
409        now = datetime.datetime.now(datetime.timezone.utc)
410        self.api_client.set_property(
411            self.uri,
412            "last_sent_to_enrichment",
413            now.isoformat(),
414        )
415
416        if not self.can_enrich:
417            msg = f"{self.uri} cannot be enriched"
418            raise CannotEnrichUnenrichableDocument(msg)
419
420        announce_document_event(
421            uri=self.uri,
422            status="enrich",
423            enrich=True,
424        )

Request enrichment of the document, but do no checks

def enrich( self, even_if_recent: bool = False, accept_failures: bool = False) -> bool: View Source

426    def enrich(self, even_if_recent: bool = False, accept_failures: bool = False) -> bool:
427        """
428        Request enrichment of a document, if it's sensible to do so.
429        """
430        if not (even_if_recent) and self.enriched_recently:
431            print("Enrichment not requested as document was enriched recently")
432            return False
433
434        print("Enrichment requested")
435
436        try:
437            self.force_enrich()
438        except CannotEnrichUnenrichableDocument as e:
439            if not accept_failures:
440                raise e
441            return False
442
443        return True

Request enrichment of a document, if it's sensible to do so.

enriched_recently: bool View Source

445    @cached_property
446    def enriched_recently(self) -> bool:
447        """
448        Has this document been enriched recently?
449        """
450
451        last_enrichment = self.body.enrichment_datetime
452        if not last_enrichment:
453            return False
454
455        now = datetime.datetime.now(tz=datetime.timezone.utc)
456
457        return now - last_enrichment < MINIMUM_ENRICHMENT_TIME

Has this document been enriched recently?

validates_against_schema: bool View Source

459    @cached_property
460    def validates_against_schema(self) -> bool:
461        """
462        Does the document validate against the most recent schema?
463        """
464        return self.api_client.validate_document(self.uri)

Does the document validate against the most recent schema?

def assign_fclid_if_missing( self) -> Optional[caselawclient.models.identifiers.fclid.FindCaseLawIdentifier]: View Source

466    def assign_fclid_if_missing(self) -> Optional[FindCaseLawIdentifier]:
467        """If the document does not have an FCLID already, mint a new one and save it."""
468        if len(self.identifiers.of_type(FindCaseLawIdentifier)) == 0:
469            document_fclid = FindCaseLawIdentifierSchema.mint(self.api_client)
470            self.identifiers.add(document_fclid)
471            self.save_identifiers()
472            return document_fclid
473
474        return None

If the document does not have an FCLID already, mint a new one and save it.

def publish(self) -> None: View Source

476    def publish(self) -> None:
477        """
478        Assuming that a document passes pre-publish checks, perform all necessary operations to put it into a published state.
479
480        :raises CannotPublishUnpublishableDocument: This document has not passed the checks in `is_publishable`, and as
481        such cannot be published.
482        """
483        if not self.is_publishable:
484            raise CannotPublishUnpublishableDocument
485
486        ## Make sure the document has an FCLID
487        self.assign_fclid_if_missing()
488
489        ## Copy the document assets into the appropriate place in S3
490        publish_documents(self.uri)
491
492        ## Set the fact the document is published
493        self.api_client.set_published(self.uri, True)
494
495        ## If necessary, set the first published date
496        if not self.first_published_datetime:
497            self.api_client.set_datetime_property(
498                self.uri, "first_published_datetime", datetime.datetime.now(datetime.timezone.utc)
499            )
500
501        ## Announce the publication on the event bus
502        announce_document_event(
503            uri=self.uri,
504            status="publish",
505        )
506
507        ## Send the document off for enrichment, but accept if we can't for any reason
508        self.enrich(accept_failures=True)

Assuming that a document passes pre-publish checks, perform all necessary operations to put it into a published state.

Raises

CannotPublishUnpublishableDocument: This document has not passed the checks in is_publishable, and as such cannot be published.

def unpublish(self) -> None: View Source

510    def unpublish(self) -> None:
511        self.api_client.break_checkout(self.uri)
512        unpublish_documents(self.uri)
513        self.api_client.set_published(self.uri, False)
514        announce_document_event(
515            uri=self.uri,
516            status="unpublish",
517        )

def hold(self) -> None: View Source

519    def hold(self) -> None:
520        self.api_client.set_property(self.uri, "editor-hold", "true")

def unhold(self) -> None: View Source

522    def unhold(self) -> None:
523        self.api_client.set_property(self.uri, "editor-hold", "false")

safe_to_delete: bool View Source

525    @cached_property
526    def safe_to_delete(self) -> bool:
527        """
528        Determines if a document is in a state where it's safe to be deleted, eg not currently publicly available.
529
530        :return: If the document is safe to be deleted
531        """
532
533        return not self.is_published

Determines if a document is in a state where it's safe to be deleted, eg not currently publicly available.

Returns

If the document is safe to be deleted

def delete(self) -> None: View Source

535    def delete(self) -> None:
536        """
537        Deletes this document from MarkLogic and any resources from AWS.
538        """
539
540        if self.safe_to_delete:
541            self.api_client.delete_judgment(self.uri)
542            delete_documents_from_private_bucket(self.uri)
543        else:
544            raise DocumentNotSafeForDeletion

Deletes this document from MarkLogic and any resources from AWS.

def move(self, new_citation: ds_caselaw_utils.types.NeutralCitationString) -> None: View Source

546    def move(self, new_citation: NeutralCitationString) -> None:
547        self.api_client.update_document_uri(self.uri, new_citation)

def force_reparse(self) -> None: View Source

549    def force_reparse(self) -> None:
550        "Send an SNS notification that triggers reparsing, also sending all editor-modifiable metadata and URI"
551
552        now = datetime.datetime.now(datetime.timezone.utc)
553        self.api_client.set_property(self.uri, "last_sent_to_parser", now.isoformat())
554
555        checked_date: Optional[str] = (
556            self.body.document_date_as_date.isoformat()
557            if self.body.document_date_as_date and self.body.document_date_as_date > datetime.date(1001, 1, 1)
558            else None
559        )
560
561        # the keys of parser_instructions should exactly match the parser output
562        # in the *-metadata.json files by the parser. Whilst typically empty
563        # values are "" from the API, we should pass None instead in this case.
564
565        parser_instructions: ParserInstructionsDict = {
566            "metadata": {
567                "name": self.body.name or None,
568                "cite": None,
569                "court": self.body.court or None,
570                "date": checked_date,
571                "uri": self.uri,
572            }
573        }
574
575        if self._default_reparse_document_type:
576            parser_instructions["documentType"] = self._default_reparse_document_type
577
578        ## TODO: Remove this hack around the fact that NCNs are assumed to be present for all documents' metadata, but actually different document classes may have different metadata
579        if hasattr(self, "neutral_citation"):
580            parser_instructions["metadata"]["cite"] = self.neutral_citation
581
582        request_parse(
583            uri=self.uri,
584            reference=self.consignment_reference,
585            parser_instructions=parser_instructions,
586        )

Send an SNS notification that triggers reparsing, also sending all editor-modifiable metadata and URI

def reparse(self) -> bool: View Source

588    def reparse(self) -> bool:
589        # note that we set 'last_sent_to_parser' even if we can't send it to the parser
590        # it means 'last tried to reparse' much more consistently.
591        now = datetime.datetime.now(datetime.timezone.utc)
592        self.api_client.set_property(self.uri, "last_sent_to_parser", now.isoformat())
593        if self.can_reparse:
594            self.force_reparse()
595            return True
596        return False

can_reparse: bool View Source

598    @cached_property
599    def can_reparse(self) -> bool:
600        """
601        Is it sensible to reparse this document?
602        """
603        return self.docx_exists() and not self.body.has_external_data

Is it sensible to reparse this document?

can_enrich: bool View Source

605    @cached_property
606    def can_enrich(self) -> bool:
607        """
608        Is it possible to enrich this document?
609        """
610        return self.body.has_content and not self.body.has_external_data

Is it possible to enrich this document?

def validate_identifiers(self) -> caselawclient.types.SuccessFailureMessageTuple: View Source

612    def validate_identifiers(self) -> SuccessFailureMessageTuple:
613        return self.identifiers.perform_all_validations(document_type=type(self), api_client=self.api_client)

def save_identifiers(self) -> None: View Source

615    def save_identifiers(self) -> None:
616        """Validate the identifiers, and if the validation passes save them to MarkLogic"""
617        validations = self.validate_identifiers()
618        if validations.success is True:
619            self.api_client.set_property_as_node(self.uri, "identifiers", self.identifiers.as_etree)
620        else:
621            raise IdentifierValidationException(
622                "Unable to save identifiers; validation constraints not met: " + ", ".join(validations.messages)
623            )

Validate the identifiers, and if the validation passes save them to MarkLogic

def linked_document_resolutions( self, namespaces: list[str], only_published: bool = True) -> caselawclient.identifier_resolution.IdentifierResolutions: View Source

632    def linked_document_resolutions(self, namespaces: list[str], only_published: bool = True) -> IdentifierResolutions:
633        """Get document resolutions which share the same neutral citation as this document."""
634        if not hasattr(self, "neutral_citation") or not self.neutral_citation:
635            return IdentifierResolutions([])
636
637        resolutions = self.api_client.resolve_from_identifier_value(self.neutral_citation)
638        if only_published:
639            resolutions = resolutions.published()
640
641        # only documents which aren't this one and have a right namespace
642        return IdentifierResolutions(
643            [
644                resolution
645                for resolution in resolutions
646                if resolution.document_uri != self.uri.as_marklogic() and resolution.identifier_namespace in namespaces
647            ]
648        )

Get document resolutions which share the same neutral citation as this document.

def linked_documents( self, namespaces: list[str], only_published: bool = True) -> list[Document]: View Source

650    def linked_documents(self, namespaces: list[str], only_published: bool = True) -> list["Document"]:
651        resolutions = self.linked_document_resolutions(namespaces=namespaces, only_published=only_published)
652        return [
653            Document(resolution.document_uri.as_document_uri(), api_client=self.api_client)
654            for resolution in resolutions
655        ]

def content_as_html(self) -> str | None: View Source

657    def content_as_html(self) -> str | None:
658        xlst_image_location = os.getenv("XSLT_IMAGE_LOCATION", "")
659        return self.body.content_html(f"{xlst_image_location}/{self.uri}")

def xml_with_correct_frbr(self) -> bytes: View Source

661    def xml_with_correct_frbr(self) -> bytes:
662        """Dynamically modify FRBR uris to reflect current storage location and FCL id"""
663        fcl_identifiers = self.identifiers.of_type(FindCaseLawIdentifier)
664        work_uri = f"https://caselaw.nationalarchives.gov.uk/id/doc/{fcl_identifiers[0].value}"
665        expression_uri = f"https://caselaw.nationalarchives.gov.uk/{self.uri.lstrip('/')}"
666        manifestation_uri = f"https://caselaw.nationalarchives.gov.uk/{self.uri.lstrip('/')}/data.xml"
667        return self.body.apply_xslt(
668            "modify_xml_live.xsl", work_uri=work_uri, expression_uri=expression_uri, manifestation_uri=manifestation_uri
669        )

Dynamically modify FRBR uris to reflect current storage location and FCL id

def compare_to( self, that_doc: Document) -> caselawclient.models.documents.comparison.Comparison: View Source

671    def compare_to(self, that_doc: "Document") -> comparison.Comparison:
672        return comparison.Comparison(self, that_doc)