anonymiser_lib/
lib.rs

1//! ## Court document package anonymiser library
2//!
3//! This library contains common code shared between the anonymiser script and the lambda.
4use clap::Parser;
5use docx_rs::*;
6use flate2::{read::GzDecoder, write::GzEncoder, Compression};
7use serde_json::{json, Value};
8use sha256::try_digest;
9
10use std::fs::{remove_file, DirEntry};
11use std::io::ErrorKind;
12use std::{fs, fs::File, io, io::Error, io::Read, path::Path, path::PathBuf};
13use tar::{Archive, Builder};
14
15/// # A struct representing the input arguments
16#[derive(Parser)]
17#[clap(name = "anonymiser")]
18pub struct Opt {
19    /// Input folder
20    #[clap(long, short, value_parser)]
21    pub input: String,
22
23    /// Output folder
24    #[clap(long, short, value_parser)]
25    pub output: String,
26}
27
28/// # Package processor
29/// This takes an output directory path and a path to a tar.gz file as input and anonymises them with the following steps:
30///
31/// * It replaces the values of Contact-Email and Contact-Name with XXXXXXX
32/// * It generates a new docx file which only contains the name of the judgment.
33/// * It updates the checksum field with the calculated checksum of the new docx file.
34/// * It renames the folder and metadata file from TDR-xxx to TST-xxx.
35/// * It creates a new tar.gz folder in the output directory.
36/// * It deletes the uncompressed folder in the output directory.
37pub fn process_package(dir_output: &PathBuf, file: &PathBuf) -> Result<PathBuf, Error> {
38    let tar_gz_file_name: String = file
39        .file_name()
40        .and_then(|name| name.to_os_string().into_string().ok())
41        .ok_or("Error getting the file name from the file")
42        .map_err(|e| Error::new(ErrorKind::InvalidInput, e))?;
43
44    let output_tar_gz_path: PathBuf =
45        Path::new(&dir_output).join(Path::new(&tar_gz_file_name.replace("TDR", "TST")));
46    let uncompressed_folder_input_path: &PathBuf = &file.with_extension("").with_extension("");
47    let input_batch_reference: String = uncompressed_folder_input_path
48        .file_name()
49        .and_then(|name| name.to_str().map(|name| name.replace("TRE-", "")))
50        .ok_or(Error::new(
51            ErrorKind::InvalidInput,
52            "Cannot get a batch reference from the file name",
53        ))?;
54    let output_batch_reference: &String = &input_batch_reference.replace("TDR", "TST");
55
56    let extracted_output_original_name: PathBuf =
57        dir_output.join(PathBuf::from(&input_batch_reference));
58    let extracted_output_path: PathBuf = dir_output.join(PathBuf::from(output_batch_reference));
59
60    let output_path_with_file = |file_name: &str| -> PathBuf {
61        let output_path = extracted_output_path.clone();
62        output_path.join(PathBuf::from(file_name))
63    };
64
65    fs::create_dir_all(extracted_output_path.clone())?;
66
67    decompress_file(file, dir_output)?;
68
69    let metadata_input_file_path: PathBuf =
70        output_path_with_file(format!("TRE-{input_batch_reference}-metadata.json").as_str());
71    let metadata_output_file_path: PathBuf =
72        output_path_with_file(format!("TRE-{output_batch_reference}-metadata.json").as_str());
73
74    if extracted_output_path.exists() {
75        fs::remove_dir_all(&extracted_output_path)?;
76    }
77    fs::rename(extracted_output_original_name, &extracted_output_path)?;
78    fs::rename(metadata_input_file_path, &metadata_output_file_path)?;
79
80    let mut metadata_json_value: Value = parse_metadata_json(&metadata_output_file_path)?;
81
82    let docx_checksum =
83        create_docx_with_checksum(&extracted_output_path, &mut metadata_json_value)?;
84
85    update_json_file(
86        &metadata_output_file_path,
87        docx_checksum,
88        &mut metadata_json_value,
89    )?;
90
91    if_present_delete(output_path_with_file(
92        format!("{input_batch_reference}.xml").as_str(),
93    ))?;
94    if_present_delete(output_path_with_file("parser.log"))?;
95
96    tar_folder(
97        &output_tar_gz_path,
98        &extracted_output_path,
99        output_batch_reference,
100    )?;
101
102    fs::remove_dir_all(&extracted_output_path)?;
103    Ok(output_tar_gz_path)
104}
105
106/// # Creates a docx and returns a checksum
107///
108/// This creates a new docx file with the name parsed from the metadata filename.
109///
110/// It then writes the judgment name to the docx file and saves it to the output directory.
111/// If there is no judgment name, it uses the filename
112///
113/// Finally, it generates a sha256 checksum for the new file and returns it.
114fn create_docx_with_checksum(
115    extracted_output_path: &Path,
116    metadata_json_value: &mut Value,
117) -> Result<String, Error> {
118    let docx_file_name: &str = metadata_json_value["parameters"]["TRE"]["payload"]["filename"]
119        .as_str()
120        .ok_or("'filename' is missing from the metadata json")
121        .map_err(|e| Error::new(ErrorKind::InvalidInput, e))?;
122
123    let judgment_name: &str = metadata_json_value["parameters"]["PARSER"]["name"]
124        .as_str()
125        .unwrap_or(docx_file_name);
126    let docx_path: PathBuf = extracted_output_path.join(PathBuf::from(docx_file_name));
127
128    let file: File = File::create(&docx_path)?;
129    Docx::new()
130        .add_paragraph(Paragraph::new().add_run(Run::new().add_text(judgment_name)))
131        .build()
132        .pack(file)?;
133
134    let docx_checksum: String = try_digest(&docx_path).unwrap();
135    Ok(docx_checksum)
136}
137
/// # Helper function to delete a file if present
///
/// Attempts the deletion directly instead of checking for existence first, so there is no
/// window between the check and the removal. A missing file is not an error.
///
/// # Errors
/// Returns any error from removing the file other than `NotFound` (for example when `path`
/// is a directory or the permissions do not allow the removal).
fn if_present_delete(path: PathBuf) -> io::Result<()> {
    match remove_file(&path) {
        Ok(()) => Ok(()),
        // Nothing to delete: this is the "not present" case.
        Err(err) if err.kind() == ErrorKind::NotFound => Ok(()),
        Err(err) => Err(err),
    }
}
145
/// # Helper function to check if a file does not start with `.`
///
/// Returns `false` for dot-files and for names that are not valid UTF-8.
fn is_not_hidden(entry: &DirEntry) -> bool {
    let name = entry.file_name();
    matches!(name.to_str(), Some(text) if !text.starts_with('.'))
}
154
/// # Helper function to check if the entry is a file
///
/// Anything that is not a directory counts as a file, including entries whose metadata
/// cannot be read.
fn is_file(entry: &DirEntry) -> bool {
    match fs::metadata(entry.path()) {
        Ok(metadata) => !metadata.is_dir(),
        Err(_) => true,
    }
}
159
160/// # List files in input directory
161///
162/// This takes a directory path and returns a list of paths of all files on that level.
163/// It will not recursively search subdirectories.
164pub fn files_in_input_dir(directory_path: &PathBuf) -> Result<Vec<PathBuf>, Error> {
165    let path_list: Vec<PathBuf> = fs::read_dir(directory_path)
166        .unwrap()
167        .filter_map(|e| {
168            let entry: DirEntry = e.ok()?;
169            if is_file(&entry) && is_not_hidden(&entry) {
170                Some(entry.path())
171            } else {
172                None
173            }
174        })
175        .collect::<Vec<PathBuf>>();
176    Ok(path_list)
177}
178
179/// # Tars and Gzips the specified folder
180///
181/// This creates a tar file at `tar_path`, compresses everything in `path_to_compress` and names it with `folder_name`
182fn tar_folder(
183    tar_path: &PathBuf,
184    path_to_compress: &PathBuf,
185    folder_name: &String,
186) -> Result<(), Error> {
187    let tar_gz: File = File::create(tar_path)?;
188    let enc: GzEncoder<File> = GzEncoder::new(tar_gz, Compression::default());
189    let mut tar: Builder<GzEncoder<File>> = Builder::new(enc);
190    tar.append_dir_all(folder_name, path_to_compress)?;
191    Ok(())
192}
193
194/// # Anonymise the contact fields and update the checksum
195fn update_json_file(
196    metadata_file_name: &PathBuf,
197    checksum: String,
198    json_value: &mut Value,
199) -> Result<(), Error> {
200    let tdr: &mut Value = &mut json_value["parameters"]["TDR"];
201    tdr["Contact-Email"] = json!("XXXXXXXXX");
202    tdr["Contact-Name"] = json!("XXXXXXXXX");
203    tdr["Document-Checksum-sha256"] = json!(checksum);
204    fs::write(metadata_file_name, json_value.to_string())
205}
206
207/// # Untar and unzip the input tar.gz file
208fn decompress_file(path_to_tar: &PathBuf, output_path: &PathBuf) -> Result<(), Error> {
209    let tar_gz: File = File::open(path_to_tar)?;
210    let tar: GzDecoder<File> = GzDecoder::new(tar_gz);
211    let mut archive: Archive<GzDecoder<File>> = Archive::new(tar);
212    archive.unpack(output_path)?;
213    Ok(())
214}
215
216/// # Read the metadata.json file and parse it into a serde `Value`
217fn parse_metadata_json(metadata_file_path: &PathBuf) -> Result<Value, Error> {
218    let mut metadata_file: File = File::open(metadata_file_path)?;
219    let mut metadata_json_as_string: String = String::new();
220    metadata_file.read_to_string(&mut metadata_json_as_string)?;
221    Ok(serde_json::from_str(&metadata_json_as_string)?)
222}
223
#[cfg(test)]
mod tests {
    use super::*;
    use crate::create_docx_with_checksum;
    use assert_fs::TempDir;
    use std::fs::{read_dir, read_to_string};
    use testlib::create_package;

    /// The docx is written under the metadata `filename` and its checksum is returned.
    #[test]
    fn test_create_docx_with_checksum() {
        let temp_dir = TempDir::new().unwrap();
        let mut metadata = json!({
            "parameters": {
                "PARSER": {
                    "name" : "test-name"
                },
                "TRE": {
                    "payload": {
                        "filename": "test-file-name.docx"
                    }
                }
            }
        });

        let checksum = create_docx_with_checksum(&temp_dir.to_path_buf(), &mut metadata).unwrap();

        let last_entry = read_dir(temp_dir.to_path_buf())
            .unwrap()
            .last()
            .unwrap()
            .unwrap();
        let created_name = last_entry.file_name();

        assert_eq!(
            created_name.to_str().unwrap().to_string(),
            "test-file-name.docx"
        );
        assert_eq!(
            checksum,
            "a951e0d7f11d9d2fa8c9508ee4b25944bb5810364089fc33221b1ec038eefd37"
        )
    }

    /// A metadata json without `filename` is rejected with a descriptive error.
    #[test]
    fn test_create_docx_with_checksum_missing_metadata_filename() {
        let temp_dir = TempDir::new().unwrap();
        let mut metadata = json!({
            "parameters": {
                "PARSER": {
                    "name" : "test-name"
                }
            }
        });

        let error = create_docx_with_checksum(&temp_dir.to_path_buf(), &mut metadata).unwrap_err();

        assert_eq!(
            error.to_string(),
            "'filename' is missing from the metadata json"
        )
    }

    /// A json file on disk is parsed into a `Value` that can be indexed by key.
    #[test]
    fn test_parse_metadata_json_parses_data_into_value() {
        let temp_dir = TempDir::new().unwrap();
        let metadata_path: PathBuf = temp_dir.join("metadata.json");
        fs::write(&metadata_path, r#"{"a": "b"}"#.as_bytes()).unwrap();

        let parsed = parse_metadata_json(&metadata_path).unwrap();

        assert_eq!(&parsed["a"], "b")
    }

    /// Unpacking the test package produces the docx and the metadata file.
    #[test]
    fn test_decompress_file() {
        let input_dir = TempDir::new().unwrap();
        let output_dir = TempDir::new().unwrap();
        let package_path = create_package(&input_dir, "{}", None);

        decompress_file(&package_path, &output_dir.to_path_buf()).unwrap();

        let expected_docx = output_dir.join("TDR-2023/test.docx");
        let expected_metadata = output_dir.join("TDR-2023/TRE-TDR-2023-metadata.json");
        assert!(expected_docx.exists());
        assert!(expected_metadata.exists());
    }

    /// Only the two contact fields and the checksum are rewritten; other keys are untouched.
    #[test]
    fn test_update_json_file() {
        let temp_dir = TempDir::new().unwrap();
        let metadata_path: PathBuf = temp_dir.join("metadata.json");
        let mut metadata = json!({
            "parameters": {
                "TDR": {
                    "Contact-Email" : "test-email",
                    "Contact-Email2": "test-email-2",
                    "TDR-Contact-Name": "tdr-contact-name",
                    "Contact-Name" : "test-name",
                    "Document-Checksum-sha256": "test-checksum"
                }
            }
        });

        update_json_file(&metadata_path, "abcde".to_owned(), &mut metadata).unwrap();

        let written_json = read_to_string(&metadata_path).unwrap();
        let expected_json = r#"{"parameters":{"TDR":{"Contact-Email":"XXXXXXXXX","Contact-Email2":"test-email-2","Contact-Name":"XXXXXXXXX","Document-Checksum-sha256":"abcde","TDR-Contact-Name":"tdr-contact-name"}}}"#;
        assert_eq!(written_json, expected_json);
    }

    /// Compressing a folder creates the tar.gz file at the requested location.
    #[test]
    fn test_tar_folder_creates_a_new_tar() {
        let tar_dir = TempDir::new().unwrap();
        let source_dir = TempDir::new().unwrap();
        let folder_name: String = String::from("test_name");
        let tar_file_path = tar_dir.join("test.tar.gz");

        tar_folder(&tar_file_path, &source_dir.to_path_buf(), &folder_name).unwrap();

        assert!(tar_file_path.exists());
    }
}
333}