diff --git a/.github/workflows/ci_docs_publish.yml b/.github/workflows/ci_docs_publish.yml
new file mode 100644
index 0000000..09435f2
--- /dev/null
+++ b/.github/workflows/ci_docs_publish.yml
@@ -0,0 +1,49 @@
+name: Publish Documentation
+
+on:
+ push:
+ branches: [main]
+
+permissions:
+ contents: read
+ pages: write
+ id-token: write
+
+jobs:
+ deploy:
+ environment:
+ name: github-pages
+ url: ${{ steps.deployment.outputs.page_url }}
+ runs-on: ubuntu-24.04
+ steps:
+ - uses: actions/configure-pages@v5
+
+ - uses: actions/checkout@v5
+
+ - name: Install extra dependencies for a Python install
+ run: |
+ sudo apt-get update
+ sudo apt -y install --no-install-recommends liblzma-dev libbz2-dev libreadline-dev
+
+ - name: Install asdf cli
+ uses: asdf-vm/actions/setup@b7bcd026f18772e44fe1026d729e1611cc435d47
+
+ - name: Install software through asdf
+ uses: asdf-vm/actions/install@b7bcd026f18772e44fe1026d729e1611cc435d47
+
+ - name: reshim asdf
+ run: asdf reshim
+
+ - name: ensure poetry using desired python version
+ run: poetry env use $(asdf which python)
+
+ - name: install docs requirements
+ run: |
+ poetry install --sync --no-interaction --with docs
+
+ - run: poetry run zensical build --clean
+ - uses: actions/upload-pages-artifact@v4
+ with:
+ path: site
+ - uses: actions/deploy-pages@v4
+ id: deployment
diff --git a/.github/workflows/ci_linting.yml b/.github/workflows/ci_linting.yml
index 1be9758..fc9d2bf 100644
--- a/.github/workflows/ci_linting.yml
+++ b/.github/workflows/ci_linting.yml
@@ -17,10 +17,10 @@ jobs:
sudo apt -y install --no-install-recommends liblzma-dev libbz2-dev libreadline-dev
- name: Install asdf cli
- uses: asdf-vm/actions/setup@v4
+ uses: asdf-vm/actions/setup@b7bcd026f18772e44fe1026d729e1611cc435d47
- name: Install software through asdf
- uses: asdf-vm/actions/install@v4
+ uses: asdf-vm/actions/install@b7bcd026f18772e44fe1026d729e1611cc435d47
- name: reshim asdf
run: asdf reshim
diff --git a/.github/workflows/ci_testing.yml b/.github/workflows/ci_testing.yml
index 232ffb7..ac25451 100644
--- a/.github/workflows/ci_testing.yml
+++ b/.github/workflows/ci_testing.yml
@@ -20,10 +20,10 @@ jobs:
sudo apt -y install --no-install-recommends liblzma-dev libbz2-dev libreadline-dev libxml2-utils
- name: Install asdf cli
- uses: asdf-vm/actions/setup@v4
+ uses: asdf-vm/actions/setup@b7bcd026f18772e44fe1026d729e1611cc435d47
- name: Install software through asdf
- uses: asdf-vm/actions/install@v4
+ uses: asdf-vm/actions/install@b7bcd026f18772e44fe1026d729e1611cc435d47
- name: reshim asdf
run: asdf reshim
diff --git a/docs/README.md b/docs/README.md
deleted file mode 100644
index fc0de4a..0000000
--- a/docs/README.md
+++ /dev/null
@@ -1,304 +0,0 @@
-The Data Validation Engine (DVE) is a configuration driven data validation library.
-
-There are 3 core steps within the DVE:
-
-1. [File transformation](./detailed_guidance/file_transformation.md) - Parsing files from their submitted format into a common format.
-2. [Data contract](./detailed_guidance/data_contract.md) - Validating the types that have been submitted and casting them.
-3. [Business rules](./detailed_guidance/business_rules.md) - Performing more complex validations such as comparisons between fields and tables.
-
-with a 4th step being important but more variable depending on platform and users:
-
-4. [Error reports](./detailed_guidance/feedback_messages.md) - Compiles the errors generated from the previous stages and presents them within an Excel report. However, this could be reconfigured to meet the needs of your users.
-
-Each of these steps produce a list of [Feedback message](details/Feedback%20message.md) objects which can be reported back to the user for them to fix any issues.
-
-DVE configuration can be instantiated from a json (dischema) file which might be structured like this:
-
-```json
-{
- "contract": {
- "cache_originals": true,
- "error_details": null,
- "types": {},
- "schemas": {},
- "datasets": {
- "CWTHeader": {
- "fields": {
- "version": {
- "description": null,
- "is_array": false,
- "callable": "constr",
- "constraints": {
- "regex": "\\d{1,2}\\.\\d{1,2}"
- }
- },
- "periodStartDate": {
- "description": null,
- "is_array": false,
- "callable": "conformatteddate",
- "constraints": {
- "date_format": "%Y-%m-%d"
- }
- }
- },
- "mandatory_fields": [
- "version",
- "periodStartDate"
- ],
- "reporting_fields": [],
- "key_field": null,
- "reader_config": {
- ".xml": {
- "reader": "XMLStreamReader",
- "kwargs": {
- "record_tag": "Header",
- "n_records_to_read": 1
- },
- "field_names": null
- }
- },
- "aliases": {}
- }
- }
- },
- "transformations": {
- "rule_stores": [],
- "reference_data": {},
- "parameters": {},
- "rules": [],
- "filters": [
- {
- "name": "version is at least 1.0",
- "entity": "CWTHeader",
- "expression": "version >= '1.0'",
- "failure_type": "submission",
- "failure_message": "version is not at least 1.0",
- "error_code": "CWT000101",
- "reporting_field": "version",
- "category": "Bad value"
- }
- ],
- "post_filter_rules": [],
- "complex_rules": []
- }
-}
-```
-"Contract" is where [Data Contract](./detailed_guidance/data_contract.md) and [File Transformation](./detailed_guidance/file_transformation.md) (in the reader configs) are configured, and (due to legacy naming) transformations are where [Business rules](./detailed_guidance/business_rules.md) are configured.
-
-## Quick start
-In the code example shared above we have a json file named `cwt_example.dischema.json` and an xml file with the following structure:
-
-```xml
-
-
-
- 1.1
- 2025-01-01
-
-
-```
-
-### Data contract
-We can see in `config.contract.datasets` that there is a `CWTHeader` entity declared. This entity has 2 fields, `version` and `periodStartDate`.
-
-`version` is declared to be a `constr` which is the constrained string type from the Pydantic library. Therefore, any keyword arguments `constr` can be passed as `constraints` here. In this case we are constraining it to a regex 1-2 digits, followed by a literal period followed by 1-2 digits. This should match an `max n2` data type.
-
-`periodStartDate` on the other hand is a `conformatteddate`, this type is one that's defined in the DVE library as a `domain_type` see [Domain types](./detailed_guidance/domain_types.md). The output of a `conformatteddate` is a date type.
-
-This means that after the data contract step the resulting data will have the types: `version::string` and `periodStartDate::date`.
-
-We can also see that the `CWTHeader` entity has both `version` and `periodStartDate` set as mandatory fields. That means that if they are missing from the file or the value is null an error will be created.
-
-### File transformation
-Within the `CWTHeader` entity we can see a `reader_config` object. This should have a key for every expected file extension that is being submitted for the given dataset. In this case just `".xml"`. We declare which reader is being used `XMLStreamReader` and any kwargs that get passed to it when it's instantiated. Stream reader expects a tag where the record exists in the file (`Header` in this case) and how many records to read. Stream reader is written to be able to quickly pull out singular records such as headers. it will stop parsing once it has hit the maximum number of records, which can save time compared to traversing the whole file.
-
-### Code
-Lets bring together those first 2 steps in code. We want to first parse the file into a spark dataframe with all string types, then apply data contract to the dataframe to get a typed dataframe.
-
-> **note in the version that comes from gitlab, the dve library is spread across a number of modules. We are looking to put this in a top level `dve` module**
-
-```python
-import os
-from pyspark.sql import SparkSession
-# The spark tools require the current active spark session
-from dve.core_engine.backends.implementations.spark.readers.xml import SparkXMLStreamReader
-# we're using the spark stream reader, this uses the xmlstream reader but outputs a dataframe
-from dve.core_engine.backends.implementations.spark.contract import SparkDataContract
-# Applies the data contract over a spark dataframe
-from dve.core_engine.configuration.v1 import V1EngineConfig
-# the engine configuration for the current DVE version
-from dve.core_engine.backends.utilities import stringify_model
-# this takes the types of the datacontract and converts them to strings with the same structure.
-```
-
-Here we have all the imports from DVE we need, the stream reader, data contract, configuration object and utility.
-
-we've also imported `os` so we can set some spark args to make sure [SparkXML](https://github.com/databricks/spark-xml) is included, and spark session which will be needed.
-
-```python
-os.environ["PYSPARK_SUBMIT_ARGS"] = " ".join(
- [
- "--packages",
- "com.databricks:spark-xml_2.12:0.16.0",
- "pyspark-shell",
- ]
-)
-spark = SparkSession.builder.getOrCreate()
-
-config = V1EngineConfig.load("cwt_example.dischema.json")
-
-data_contract_config = config.get_contract_metadata()
-reader_configs = data_contract_config.reader_metadata
-
-readers = {"XMLStreamReader": SparkXMLStreamReader}
-
-# File transformation step here
-entities = {}
-for entity in data_contract_config.schemas:
- # get config based on file type you're parsing
- ext_config = reader_configs[entity][".xml"]
- reader = readers[ext_config.reader](**ext_config.parameters)
- df = reader.read_to_dataframe(
- "cwt_example.xml", entity, stringify_model(data_contract_config.schemas[entity])
- )
- entities[entity] = df
-
-# Data contract step here
-data_contract = SparkDataContract(spark_session=spark)
-entities, feedback_errors_uri, success = data_contract.apply_data_contract(
- entities, None, data_contract_config
-)
-```
-
-from the top down we
-- set some spark arguments to make sure we have spark-xml present
-- get a spark session
-- load the configuration
-- get the data contract config specifically
-**file transformation**
-- get the reader configurations from the data contract config
-- create a mapping of reader_names to their concrete class. (This allows us to refer to a more abstract name in the config and decide what backend we're using in the code)
-- create an empty entity dictionary
-- iterate over each of the entities defined in the config
-- get the reader configuration for the file type we're reading (xml in this case)
-- get the concrete reader and instantiate it with the parameters we set in the config
-- read the file with a stringified model, this maintains the structure of the datacontract but makes sure everything is kept as strings.
-- add the dataframe to the entities dictionary
-**data contract**
-- instatiate the SparkDataContract class with a spark session
-- apply the data contract to the dict of entities returning the entities in the correct types. any validation messages and a success bool
-### Business rules
-
-Now we have typed entities we can apply business rules to them. We need a step implementation. we'll import that from the spark rules backend.
-
-```python
-from dve.core_engine.backends.implementations.spark.rules import SparkStepImplementations
-
-business_rules = SparkStepImplementations(spark_session=spark)
-business_rule_config = config.get_rule_metadata()
-
-messages = business_rules.apply_rules(entities, business_rule_config)
-```
-
-There we go. Messages is a list of [Feedback message](./detailed_guidance/feedback_messages.md) for every failed rule.
-
-### Utilising the Pipeline objects to run the DVE
-Within the DVE package, we have also created the ability to build pipeline objects to help orchestrate the running of the DVE from start to finish. We currently have an implementation for `Spark` and `DuckDB`. These pipeline objects abstract some of the complexity described above and only requires you to supply a few objects to run the DVE from start (file transformation) to finish (error reports). These can be read in further detail [here](../src/pipeline/) and we have tests [here](../tests/test_pipeline/) to ensure they are working as expected. Furthermore, if you have a situation where maybe you only want to run the Data Contract, then you can utilise the pipeline objects in a way that only runs the specific stages that you want. Below will showcase an example where the full e2e pipeline is run and how you can trigger the stages that you want.
-
-> **note in the version that comes from gitlab, the dve library is spread across a number of modules. We are looking to put this in a top level `dve` module**
-
-```python
-# Imports for a spark setup
-from pyspark.sql import SparkSession
-
-from core_engine.backends.implementations.spark.auditing import SparkAuditingManager
-from pipeline.spark_pipeline import SparkDVEPipeline
-
-# Local Spark Setup
-os.environ["PYSPARK_SUBMIT_ARGS"] = " ".join(
- [
- "--packages",
- "com.databricks:spark-xml_2.12:0.16.0",
- "pyspark-shell",
- ]
-)
-
-spark = SparkSession.builder.getOrCreate()
-
-# Setting up the audit manager
-audit_manager = SparkAuditingManager(
- database=spark_test_database,
- pool=ThreadPoolExecutor(1),
- spark=spark,
-)
-
-# Setting up the Pipeline (in this case the Spark implemented one)
-pipeline = SparkDVEPipeline(
- processed_files_path="path/where/my/processed_files/should_go/",
- audit_tables=audit_manager,
- job_run_id=1,
- rules_path="path/to/my_dischema",
- submitted_files_path="path/to/my/cwt_files/",
- reference_data_loader=SparkParquetRefDataLoader,
- spark=spark
-)
-```
-
-Once you have setup the Pipeline object, audit object and your environment - you are ready to use the pipeline in whatever way works for you. You can simply utilise the `cluster_pipeline_run` method which will run all the stages of dve (from file transformation to error reports) or you can run the stages that you specifically need. For instance...
-
-```python
-# this will run all stages of the dve
-dve_pipeline.cluster_pipeline_run(max_workers=2)
-```
-
-**OR**
-
-```python
-submitted_files = dve_pipeline._get_submission_files_for_run()
-submitted_file_infos = []
-
-for submission in submitted_files:
- submitted_file_infos.append(dve_pipeline.audit_received_file(sub_id, *subs))
-
-dve_pipeline.data_contract_step(
- pool=ThreadPoolExecutor(2),
- file_transform_results=submitted_file_infos
-)
-```
-
-For the Data Contract step you may have noticed that you will need to provide a list of `SubmissionInfo` objects. These are pydantic models which contain metadata for a given Submission. Within this example we are using the `_get_submission_files_for_run` method to get a tuple of URI's where the Submission URI and Metadata URI exist for a given submission. We then pass them through the `audit_received_file_step` method to audit the submissions and in return get a SubmissionInfo object that we can then utilise within the `data_contract_step` method.
-
-If you'd rather not rely on needing a `metadata.json` associated with your submitted files you can build your own bespoke process for building a list of `SubmissionInfo` objects.
-
-### Mixing backends
-
-The examples shown above are using the Spark Backend. DVE also has a DuckDB backend found at [core_engine.backends.implementations.duckdb](../src/core_engine/backends/implementations/duckdb/). In order to mix the two you will need to convert from one type of entity to the other. For example from a spark `Dataframe` to DuckDB `relation`. The easiest way to do this is to use the `write_parquet` method from one backend and use `read_parquet` from another backend.
-
-Currently the configuration isn't backend agnostic for applying business rules. So if you want to swap between spark and duckdb, the business rules need to be written using only features that are common to both backends. For example, a regex check in spark would be something along the lines of...
-```sql
-nhsnumber rlike '^\d{10}$'
-```
-...but in duckdb it would be...
-```sql
-regexp_matches(nhsnumber, '^\d{10}$')
-```
-Failures in parsing the expressions lead to failure messages such as
-```python
-FeedbackMessage(
- entity=None,
- record=None,
- failure_type='integrity',
- is_informational=False,
- error_type=None,
- error_location=None,
- error_message="Unexpected error (AnalysisException: Undefined function: 'regexp_matches'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 1 pos 5) in transformations (rule: root; step: 0; id: None)",
- error_code=None,
- reporting_field=None,
- reporting_field_name=None,
- value=None,
- category=None
-)
-```
-
-# Extra information
-Thanks for reading the documentation and looking into utilising the DVE. If you need more information on any of the steps you can find the following guidance below. If you need additional support, please raise an issue ([see guidance here](../CONTRIBUTE.md)) and we will try and respond to you as quickly as possible.
diff --git a/docs/advanced_guidance/index.md b/docs/advanced_guidance/index.md
new file mode 100644
index 0000000..4ffaa8e
--- /dev/null
+++ b/docs/advanced_guidance/index.md
@@ -0,0 +1,25 @@
+---
+title: Advanced Guidance
+---
+
+
+
+- :material-file-code:{ .lg .middle } __DVE Code Reference Documentation__
+
+ ---
+
+ [:octicons-arrow-right-24: Read the code documentation here](package_documentation/)
+
+- :material-database-plus:{ .lg .middle } __Implementing a new backend__
+
+ ---
+
+ [:octicons-arrow-right-24: Setup a new backend here](new_backend.md)
+
+- :material-code-block-braces:{ .lg .middle } __Setting up a dischema language server__
+
+ ---
+
+ [:octicons-arrow-right-24: Setup your environment to make writing rules easier](json_schemas.md)
+
+
\ No newline at end of file
diff --git a/docs/advanced_guidance/json_schemas.md b/docs/advanced_guidance/json_schemas.md
new file mode 100644
index 0000000..5f24bd4
--- /dev/null
+++ b/docs/advanced_guidance/json_schemas.md
@@ -0,0 +1,35 @@
+# JSON Schemas
+
+JSON schemas define how the rules within the dischema document should be written. We also include components to help write the rulestore or ruleset documents as well.
+
+You can download a copy of the json schemas [here](https://github.com/NHSDigital/data-validation-engine/tree/main/docs/advanced_guidance/json_schemas).
+
+For autocomplete support in VS Code, you can alter the `.vscode/settings.json` and add new entries to the
+`json.schemas` key. If not present, simply copy & paste the code shown below into your `settings.json`:
+
+```json
+{
+ // ...existing settings...
+ "json.schemas": [
+ {
+ "fileMatch": [
+ "*.dischema.json"
+ ],
+ "url": "./json_schemas/dataset.schema.json"
+ },
+ {
+ "fileMatch": [
+ "*.rulestore.json",
+ "*_ruleset.json"
+ ],
+ "url": "./json_schemas/rule_store.schema.json"
+ }
+ ]
+}
+```
+
+Your dischema will then have autocomplete and syntax suggestion support.
+
+## Components
+
+The DVE rules are built on a number of components. You can view the components [here](https://github.com/NHSDigital/data-validation-engine/tree/main/docs/advanced_guidance/json_schemas).
diff --git a/docs/json_schemas/contract/components/base_entity.schema.json b/docs/advanced_guidance/json_schemas/contract/components/base_entity.schema.json
similarity index 100%
rename from docs/json_schemas/contract/components/base_entity.schema.json
rename to docs/advanced_guidance/json_schemas/contract/components/base_entity.schema.json
diff --git a/docs/json_schemas/contract/components/contact_error_details.schema.json b/docs/advanced_guidance/json_schemas/contract/components/contact_error_details.schema.json
similarity index 100%
rename from docs/json_schemas/contract/components/contact_error_details.schema.json
rename to docs/advanced_guidance/json_schemas/contract/components/contact_error_details.schema.json
diff --git a/docs/json_schemas/contract/components/entity.schema.json b/docs/advanced_guidance/json_schemas/contract/components/entity.schema.json
similarity index 100%
rename from docs/json_schemas/contract/components/entity.schema.json
rename to docs/advanced_guidance/json_schemas/contract/components/entity.schema.json
diff --git a/docs/json_schemas/contract/components/field.schema.json b/docs/advanced_guidance/json_schemas/contract/components/field.schema.json
similarity index 100%
rename from docs/json_schemas/contract/components/field.schema.json
rename to docs/advanced_guidance/json_schemas/contract/components/field.schema.json
diff --git a/docs/json_schemas/contract/components/field_error_detail.schema.json b/docs/advanced_guidance/json_schemas/contract/components/field_error_detail.schema.json
similarity index 100%
rename from docs/json_schemas/contract/components/field_error_detail.schema.json
rename to docs/advanced_guidance/json_schemas/contract/components/field_error_detail.schema.json
diff --git a/docs/json_schemas/contract/components/field_error_type.schema copy.json b/docs/advanced_guidance/json_schemas/contract/components/field_error_type.schema copy.json
similarity index 100%
rename from docs/json_schemas/contract/components/field_error_type.schema copy.json
rename to docs/advanced_guidance/json_schemas/contract/components/field_error_type.schema copy.json
diff --git a/docs/json_schemas/contract/components/field_error_type.schema.json b/docs/advanced_guidance/json_schemas/contract/components/field_error_type.schema.json
similarity index 100%
rename from docs/json_schemas/contract/components/field_error_type.schema.json
rename to docs/advanced_guidance/json_schemas/contract/components/field_error_type.schema.json
diff --git a/docs/json_schemas/contract/components/field_specification.schema.json b/docs/advanced_guidance/json_schemas/contract/components/field_specification.schema.json
similarity index 100%
rename from docs/json_schemas/contract/components/field_specification.schema.json
rename to docs/advanced_guidance/json_schemas/contract/components/field_specification.schema.json
diff --git a/docs/json_schemas/contract/components/readable_entity.schema.json b/docs/advanced_guidance/json_schemas/contract/components/readable_entity.schema.json
similarity index 100%
rename from docs/json_schemas/contract/components/readable_entity.schema.json
rename to docs/advanced_guidance/json_schemas/contract/components/readable_entity.schema.json
diff --git a/docs/json_schemas/contract/components/type_name.schema.json b/docs/advanced_guidance/json_schemas/contract/components/type_name.schema.json
similarity index 100%
rename from docs/json_schemas/contract/components/type_name.schema.json
rename to docs/advanced_guidance/json_schemas/contract/components/type_name.schema.json
diff --git a/docs/json_schemas/contract/components/validation_function.schema.json b/docs/advanced_guidance/json_schemas/contract/components/validation_function.schema.json
similarity index 100%
rename from docs/json_schemas/contract/components/validation_function.schema.json
rename to docs/advanced_guidance/json_schemas/contract/components/validation_function.schema.json
diff --git a/docs/json_schemas/contract/contract.schema.json b/docs/advanced_guidance/json_schemas/contract/contract.schema.json
similarity index 100%
rename from docs/json_schemas/contract/contract.schema.json
rename to docs/advanced_guidance/json_schemas/contract/contract.schema.json
diff --git a/docs/json_schemas/dataset.schema.json b/docs/advanced_guidance/json_schemas/dataset.schema.json
similarity index 100%
rename from docs/json_schemas/dataset.schema.json
rename to docs/advanced_guidance/json_schemas/dataset.schema.json
diff --git a/docs/json_schemas/rule_store.schema.json b/docs/advanced_guidance/json_schemas/rule_store.schema.json
similarity index 100%
rename from docs/json_schemas/rule_store.schema.json
rename to docs/advanced_guidance/json_schemas/rule_store.schema.json
diff --git a/docs/json_schemas/transformations/components/business_filter.schema.json b/docs/advanced_guidance/json_schemas/transformations/components/business_filter.schema.json
similarity index 100%
rename from docs/json_schemas/transformations/components/business_filter.schema.json
rename to docs/advanced_guidance/json_schemas/transformations/components/business_filter.schema.json
diff --git a/docs/json_schemas/transformations/components/business_rule.schema.json b/docs/advanced_guidance/json_schemas/transformations/components/business_rule.schema.json
similarity index 100%
rename from docs/json_schemas/transformations/components/business_rule.schema.json
rename to docs/advanced_guidance/json_schemas/transformations/components/business_rule.schema.json
diff --git a/docs/json_schemas/transformations/components/concrete_filter.schema.json b/docs/advanced_guidance/json_schemas/transformations/components/concrete_filter.schema.json
similarity index 100%
rename from docs/json_schemas/transformations/components/concrete_filter.schema.json
rename to docs/advanced_guidance/json_schemas/transformations/components/concrete_filter.schema.json
diff --git a/docs/json_schemas/transformations/components/core_filter.schema.json b/docs/advanced_guidance/json_schemas/transformations/components/core_filter.schema.json
similarity index 100%
rename from docs/json_schemas/transformations/components/core_filter.schema.json
rename to docs/advanced_guidance/json_schemas/transformations/components/core_filter.schema.json
diff --git a/docs/json_schemas/transformations/components/filter.schema.json b/docs/advanced_guidance/json_schemas/transformations/components/filter.schema.json
similarity index 100%
rename from docs/json_schemas/transformations/components/filter.schema.json
rename to docs/advanced_guidance/json_schemas/transformations/components/filter.schema.json
diff --git a/docs/json_schemas/transformations/components/multiple_expressions.schema.json b/docs/advanced_guidance/json_schemas/transformations/components/multiple_expressions.schema.json
similarity index 100%
rename from docs/json_schemas/transformations/components/multiple_expressions.schema.json
rename to docs/advanced_guidance/json_schemas/transformations/components/multiple_expressions.schema.json
diff --git a/docs/json_schemas/transformations/components/rule.schema.json b/docs/advanced_guidance/json_schemas/transformations/components/rule.schema.json
similarity index 100%
rename from docs/json_schemas/transformations/components/rule.schema.json
rename to docs/advanced_guidance/json_schemas/transformations/components/rule.schema.json
diff --git a/docs/json_schemas/transformations/transformations.schema.json b/docs/advanced_guidance/json_schemas/transformations/transformations.schema.json
similarity index 100%
rename from docs/json_schemas/transformations/transformations.schema.json
rename to docs/advanced_guidance/json_schemas/transformations/transformations.schema.json
diff --git a/docs/advanced_guidance/new_backend.md b/docs/advanced_guidance/new_backend.md
new file mode 100644
index 0000000..1c63d00
--- /dev/null
+++ b/docs/advanced_guidance/new_backend.md
@@ -0,0 +1,2 @@
+!!! note
+ This section has not yet been written. Coming soon.
diff --git a/docs/advanced_guidance/package_documentation/auditing.md b/docs/advanced_guidance/package_documentation/auditing.md
new file mode 100644
index 0000000..a043e9a
--- /dev/null
+++ b/docs/advanced_guidance/package_documentation/auditing.md
@@ -0,0 +1,13 @@
+
+::: dve.core_engine.backends.base.auditing.BaseAuditingManager
+ options:
+ heading_level: 3
+ members:
+ - get_submission_info
+ - get_submission_statistics
+ - get_submission_status
+ - get_all_file_transformation_submissions
+ - get_all_data_contract_submissions
+ - get_all_business_rule_submissions
+ - get_all_error_report_submissions
+ - get_current_processing_info
diff --git a/docs/advanced_guidance/package_documentation/domain_types.md b/docs/advanced_guidance/package_documentation/domain_types.md
new file mode 100644
index 0000000..8fefe9b
--- /dev/null
+++ b/docs/advanced_guidance/package_documentation/domain_types.md
@@ -0,0 +1,6 @@
+
+::: dve.metadata_parser.domain_types
+ handler: python
+ options:
+ show_root_heading: true
+ heading_level: 2
diff --git a/docs/advanced_guidance/package_documentation/feedback_messages.md b/docs/advanced_guidance/package_documentation/feedback_messages.md
new file mode 100644
index 0000000..ede7aad
--- /dev/null
+++ b/docs/advanced_guidance/package_documentation/feedback_messages.md
@@ -0,0 +1,12 @@
+
+::: dve.core_engine.message.FeedbackMessage
+ handler: python
+ options:
+ show_root_heading: true
+ heading_level: 2
+
+::: dve.core_engine.message.UserMessage
+ handler: python
+ options:
+ show_root_heading: true
+ heading_level: 2
diff --git a/docs/advanced_guidance/package_documentation/index.md b/docs/advanced_guidance/package_documentation/index.md
new file mode 100644
index 0000000..c2e76c5
--- /dev/null
+++ b/docs/advanced_guidance/package_documentation/index.md
@@ -0,0 +1,19 @@
+---
+title: Package Documentation
+---
+
+
+
+- :material-language-python:{ .lg .middle } __Pipeline__
+
+ ---
+
+ [:octicons-arrow-right-24: Read about the `Pipeline` objects here](pipeline.md)
+
+- :material-set-all:{ .lg .middle } __Reference Data Loader__
+
+ ---
+
+ [:octicons-arrow-right-24: Read about the `Reference Data Loader` objects here](refdata_loaders.md)
+
+
\ No newline at end of file
diff --git a/docs/advanced_guidance/package_documentation/models.md b/docs/advanced_guidance/package_documentation/models.md
new file mode 100644
index 0000000..42e9420
--- /dev/null
+++ b/docs/advanced_guidance/package_documentation/models.md
@@ -0,0 +1,6 @@
+
+::: dve.core_engine.models
+ handler: python
+ options:
+ show_root_heading: true
+ heading_level: 2
diff --git a/docs/advanced_guidance/package_documentation/operations.md b/docs/advanced_guidance/package_documentation/operations.md
new file mode 100644
index 0000000..8c3fb6e
--- /dev/null
+++ b/docs/advanced_guidance/package_documentation/operations.md
@@ -0,0 +1,6 @@
+
+::: dve.core_engine.backends.metadata.rules
+ handler: python
+ options:
+ show_root_heading: true
+ heading_level: 2
diff --git a/docs/advanced_guidance/package_documentation/pipeline.md b/docs/advanced_guidance/package_documentation/pipeline.md
new file mode 100644
index 0000000..bdea5a6
--- /dev/null
+++ b/docs/advanced_guidance/package_documentation/pipeline.md
@@ -0,0 +1,18 @@
+
+::: dve.pipeline.pipeline.BaseDVEPipeline
+ handler: python
+ options:
+ show_root_heading: true
+ heading_level: 2
+
+::: dve.pipeline.duckdb_pipeline.DDBDVEPipeline
+ handler: python
+ options:
+ show_root_heading: true
+ heading_level: 2
+
+::: dve.pipeline.spark_pipeline.SparkDVEPipeline
+ handler: python
+ options:
+ show_root_heading: true
+ heading_level: 2
diff --git a/docs/advanced_guidance/package_documentation/readers.md b/docs/advanced_guidance/package_documentation/readers.md
new file mode 100644
index 0000000..4f19571
--- /dev/null
+++ b/docs/advanced_guidance/package_documentation/readers.md
@@ -0,0 +1,87 @@
+## CSV
+
+=== "Base"
+
+ ::: dve.core_engine.backends.readers.csv.CSVFileReader
+ options:
+ heading_level: 3
+ merge_init_into_class: true
+ members: false
+
+=== "DuckDB"
+
+ ::: dve.core_engine.backends.implementations.duckdb.readers.csv.DuckDBCSVReader
+ options:
+ heading_level: 3
+ members:
+ - __init__
+
+ ::: dve.core_engine.backends.implementations.duckdb.readers.csv.PolarsToDuckDBCSVReader
+ options:
+ heading_level: 3
+ members:
+ - __init__
+
+ ::: dve.core_engine.backends.implementations.duckdb.readers.csv.DuckDBCSVRepeatingHeaderReader
+ options:
+ heading_level: 3
+ members:
+ - __init__
+
+=== "Spark"
+
+ ::: dve.core_engine.backends.implementations.spark.readers.csv.SparkCSVReader
+ options:
+ heading_level: 3
+ members:
+ - __init__
+
+## JSON
+
+=== "DuckDB"
+
+ ::: dve.core_engine.backends.implementations.duckdb.readers.json.DuckDBJSONReader
+ options:
+ heading_level: 3
+ members:
+ - __init__
+
+=== "Spark"
+
+ ::: dve.core_engine.backends.implementations.spark.readers.json.SparkJSONReader
+ options:
+ heading_level: 3
+ members:
+ - __init__
+
+## XML
+
+=== "Base"
+
+ ::: dve.core_engine.backends.readers.xml.BasicXMLFileReader
+ options:
+ heading_level: 3
+ merge_init_into_class: true
+ members: false
+
+=== "DuckDB"
+
+ ::: dve.core_engine.backends.implementations.duckdb.readers.xml.DuckDBXMLStreamReader
+ options:
+ heading_level: 3
+ members:
+ - __init__
+
+=== "Spark"
+
+ ::: dve.core_engine.backends.implementations.spark.readers.xml.SparkXMLStreamReader
+ options:
+ heading_level: 3
+ members:
+ - __init__
+
+ ::: dve.core_engine.backends.implementations.spark.readers.xml.SparkXMLReader
+ options:
+ heading_level: 3
+ members:
+ - __init__
diff --git a/docs/advanced_guidance/package_documentation/refdata_loaders.md b/docs/advanced_guidance/package_documentation/refdata_loaders.md
new file mode 100644
index 0000000..8535fd5
--- /dev/null
+++ b/docs/advanced_guidance/package_documentation/refdata_loaders.md
@@ -0,0 +1,18 @@
+
+::: dve.core_engine.backends.base.reference_data.BaseRefDataLoader
+ handler: python
+ options:
+ show_root_heading: true
+ heading_level: 2
+
+::: dve.core_engine.backends.implementations.duckdb.reference_data.DuckDBRefDataLoader
+ handler: python
+ options:
+ show_root_heading: true
+ heading_level: 2
+
+::: dve.core_engine.backends.implementations.spark.reference_data.SparkRefDataLoader
+ handler: python
+ options:
+ show_root_heading: true
+ heading_level: 2
diff --git a/docs/advanced_guidance/package_documentation/refence_data_types.md b/docs/advanced_guidance/package_documentation/refence_data_types.md
new file mode 100644
index 0000000..c5ad0ad
--- /dev/null
+++ b/docs/advanced_guidance/package_documentation/refence_data_types.md
@@ -0,0 +1,18 @@
+
+::: dve.core_engine.backends.base.reference_data.ReferenceTable
+ handler: python
+ options:
+ show_root_heading: true
+ heading_level: 2
+
+::: dve.core_engine.backends.base.reference_data.ReferenceFile
+ handler: python
+ options:
+ show_root_heading: true
+ heading_level: 2
+
+::: dve.core_engine.backends.base.reference_data.ReferenceURI
+ handler: python
+ options:
+ show_root_heading: true
+ heading_level: 2
diff --git a/docs/assets/images/doc_images/error_data.png b/docs/assets/images/doc_images/error_data.png
new file mode 100644
index 0000000..c8fb193
Binary files /dev/null and b/docs/assets/images/doc_images/error_data.png differ
diff --git a/docs/assets/images/doc_images/error_summary.png b/docs/assets/images/doc_images/error_summary.png
new file mode 100644
index 0000000..d21d9b9
Binary files /dev/null and b/docs/assets/images/doc_images/error_summary.png differ
diff --git a/docs/assets/images/doc_images/summary_view.png b/docs/assets/images/doc_images/summary_view.png
new file mode 100644
index 0000000..b1ca282
Binary files /dev/null and b/docs/assets/images/doc_images/summary_view.png differ
diff --git a/docs/assets/images/favicon.ico b/docs/assets/images/favicon.ico
new file mode 100644
index 0000000..aff7490
Binary files /dev/null and b/docs/assets/images/favicon.ico differ
diff --git a/docs/assets/images/favicon.svg b/docs/assets/images/favicon.svg
new file mode 100644
index 0000000..cd21739
--- /dev/null
+++ b/docs/assets/images/favicon.svg
@@ -0,0 +1,4 @@
+
diff --git a/docs/assets/images/nhsuk-icon-180.png b/docs/assets/images/nhsuk-icon-180.png
new file mode 100644
index 0000000..4881d5e
Binary files /dev/null and b/docs/assets/images/nhsuk-icon-180.png differ
diff --git a/docs/assets/images/nhsuk-icon-192.png b/docs/assets/images/nhsuk-icon-192.png
new file mode 100644
index 0000000..459248f
Binary files /dev/null and b/docs/assets/images/nhsuk-icon-192.png differ
diff --git a/docs/assets/images/nhsuk-icon-512.png b/docs/assets/images/nhsuk-icon-512.png
new file mode 100644
index 0000000..a2697f4
Binary files /dev/null and b/docs/assets/images/nhsuk-icon-512.png differ
diff --git a/docs/assets/images/nhsuk-icon-mask.svg b/docs/assets/images/nhsuk-icon-mask.svg
new file mode 100644
index 0000000..970859e
--- /dev/null
+++ b/docs/assets/images/nhsuk-icon-mask.svg
@@ -0,0 +1,3 @@
+
diff --git a/docs/assets/images/nhsuk-opengraph-image.png b/docs/assets/images/nhsuk-opengraph-image.png
new file mode 100644
index 0000000..434b2d9
Binary files /dev/null and b/docs/assets/images/nhsuk-opengraph-image.png differ
diff --git a/docs/assets/stylesheets/extra.css b/docs/assets/stylesheets/extra.css
new file mode 100644
index 0000000..945791d
--- /dev/null
+++ b/docs/assets/stylesheets/extra.css
@@ -0,0 +1,57 @@
+:root {
+ --nhs-blue: #005EB8;
+}
+
+.md-header__button.md-logo img {
+ transform: scale(1.8);
+ transform-origin: left center;
+}
+
+.md-footer-meta {
+ background-color: transparent !important;
+ box-shadow: none;
+}
+
+.md-footer-meta::before {
+ content: "";
+ display: block;
+ height: 3px;
+ background-color: var(--nhs-blue);
+}
+
+.md-footer-meta.md-typeset a {
+ margin: 0 0.1rem;
+}
+
+.md-footer-meta.md-typeset a svg {
+ transform: scale(1.5);
+ transform-origin: center;
+ transition: transform 0.2s ease, filter 0.2s ease;
+}
+
+.md-footer-meta.md-typeset a:hover svg {
+ transform: scale(1.75);
+ filter: brightness(1.2) drop-shadow(0 0 6px rgba(30, 136, 229, 0.7));
+}
+
+.md-footer-meta a {
+ text-decoration: underline;
+}
+
+/* Light mode */
+[data-md-color-scheme="default"] {
+ --md-primary-fg-color: var(--nhs-blue);
+ --md-footer-bg-color: var(--md-default-bg-color);
+ --md-footer-fg-color: #000000;
+ --md-footer-fg-color--light: #333333;
+ --md-footer-fg-color--lighter: #555555;
+}
+
+/* Dark mode */
+[data-md-color-scheme="slate"] {
+ --md-primary-fg-color: var(--nhs-blue);
+ --md-footer-bg-color: var(--md-default-bg-color);
+ --md-footer-fg-color: #e0e0e0;
+ --md-footer-fg-color--light: #bdbdbd;
+ --md-footer-fg-color--lighter: #9e9e9e;
+}
diff --git a/docs/detailed_guidance/business_rules.md b/docs/detailed_guidance/business_rules.md
deleted file mode 100644
index adb4e37..0000000
--- a/docs/detailed_guidance/business_rules.md
+++ /dev/null
@@ -1,363 +0,0 @@
-# Business Rules
-Business rules are defined in the `transformations` section of the config. There are 6 keys within the json document that we will discuss in more detail throughout this document.
-
-## Keys
-| Key | Purpose |
-| --- | ------- |
-| `parameters` | For setting globally available variables. |
-| `reference_data` | For bringing in reference data tables. |
-| `rules_store` | For referring to other configuration files that contain shared rules. |
-| `filters` | Simple rules that don't require much or any transformation. |
-| `complex_rules` | Series of transformations that end in a filter. Such as joining or aggregating before performing a check. |
-| `post_filter_rules` | For clearing down created entities that are no longer needed after validation. |
-
-## Filters
-These are the most simple of the business rules. These are defined as a json object with the following structure:
-```json
-{
-"entity": "APCActivity",
-"name": "EpiNo_is_valid",
-"expression": "EpiNo IS NULL OR EpiNo RLIKE '^(0[1-9]|[1-7][0-9]|8[0-7]|9[89])$'",
-"failure_type": "submission",
-"failure_message": "is invalid",
-"error_code": "1203",
-"reporting_field": "EpiNo",
-"is_informational" : false,
-"category": "Bad value"
-}
-```
-This rule checks that EpiNo must be present and that the value is 01-87 or 98 or 99. If EpiNo is missing this rule doesnt fire (to prevent double dinging a missing value). Any EpiNo that are present but not one of the values expected will raise a 1203 error with the message "is invalid".
-Lets break it down:
-| Key | Purpose |
-| --- | ------- |
-| `entity` | This is the name of the entity to perform the filter on. In this Case the `APCActivity` dataframe |
-| `name` | This should be a descriptive name for the rule. |
-| `expression` | The SQL expression that evaluates to a bool. Any row that evaluates to False will be filtered out. This is so that you can define the rules as they are written in the ETOS rather than inverting the conditions |
-| `failure_type` | The type of failure. There are three types of failures. Submission, record or integrity.
"submission" means the whole submission is invalidated by a failure in this rule and should be rejected (though this is for you to implement).
"record" means that this row of data is invalid, and will be excluded from the output.
"integrity" means that some constraint on the data has failed and no further processing can occur. This is normally raised when there is a parsing error in the expression but can be used to quickly reject data that doesn't meet a basic check
|
-| `failure_message` | The message you wish for the user to receive |
-| `error_code` | the code that links back to the specification. For example CHC had error codes like `CHC0010021` for the second field in the CHC001 tables first validation. This allows for collection of metrics for which rules have fired (again for you to implement) and allows the user to go back to the specification if the error message isn't clear enough |
-| `reporting_field` | This is the field to report back to the user as having failed. The expression could be more complex and be something like `if(NhsNumber is null, NHSStatus = '05', True)`, where is NHSNumber isn't null we short circuit the rule and everything else passes. otherwise the result of the expression is the `NHSStaus` being 05. In this case you may wish to have reporting field be `NHSStatus` and report back which status triggered the check. or `['NHSNumber', 'NHSStatus']` to report them both back. |
-| `is_informational` | This bool signals that this is a warning rather than an error
-| `category` | Optional literal. Used more in metrics to give an idea of how many things fail due to a values being wrong, formatting, nulls or file parsing. Below is an example of categorical error types...
"bad value" - The value(s) in the check were wrong
"wrong format" - The formatting of the field is incorrect
"blank" - the value is missing when it shouldn't be
"bad file" - usually used when the file fails to parse due to bad formatting
-
-## Parameters
-Parameters are globally available parameters that can be templated in to a rule using jinja2 syntax.
-
-Lets say we have an example that compares several fields against the start of the financial year.
-
-we could implement it like this:
-
-```json
-{
- "filters" : [
- {
- "entity": "APCActivity",
- "name": "StartDate_is_valid",
- "expression": "StartDate >= '2025-04-01'",
- "failure_type": "submission",
- "failure_message": "start date is before the start of the financial year",
- "error_code": "1203",
- "reporting_field": "StartDate",
- "is_informational" : false,
- "category": "Bad value"
- },
- {
- "entity": "APCActivity",
- "name": "EndDate_is_valid",
- "expression": "EndDate >= '2025-04-01'",
- "failure_type": "submission",
- "failure_message": "EndDate is before the start of the financial year",
- "error_code": "1203",
- "reporting_field": "EndDate",
- "is_informational" : false,
- "category": "Bad value"
- },
- ...
- ]
-}
-```
-This is fine for just 2 rules, but what if all dates need to be after the start of the financial year? What if a requirement comes in that it should be the 6th of april not the 1st of april?
-
-This is what parameters are for.
-
-```json
-{
- "parameters" : {
- "financial_year_start_date" : "'2025-04-01'"
- },
- "filters" : [
- {
- "entity": "APCActivity",
- "name": "StartDate_is_valid",
- "expression": "StartDate >= {{ financial_year_start_date }}",
- "failure_type": "submission",
- "failure_message": "start date is before the start of the financial year",
- "error_code": "1203",
- "reporting_field": "StartDate",
- "is_informational" : false,
- "category": "Bad value"
- },
- {
- "entity": "APCActivity",
- "name": "EndDate_is_valid",
- "expression": "EndDate >= {{ financial_year_start_date }}",
- "failure_type": "submission",
- "failure_message": "EndDate is before the start of the financial year",
- "error_code": "1204",
- "reporting_field": "EndDate",
- "is_informational" : false,
- "category": "Bad value"
- },
- ...
- ]
-}
-```
-Now we have the financial year start date parameterized. Any rule that needs to use it uses the same version. If we change the value in the parameter, all of the rules that use the parameter are updated too.
-
-These rules are quite repetitive. Using the same set up, similar error message. The only difference is the reporting field and error code in essence.
-
-## Complex rules
-
-Complex rules are pre-configured rules that can have multiple steps and accept parameters. These need to be defined in another file and then brought in using the rule store.
-
-The complex rule key in the main configuration refers to externally defined complex rules, and passes any parameters into them. So lets look at a simple rule, refactoring the example from above.
-
-> `complex_rules.rulestore.json`
-```json
-{
- "date_is_ge_financial_year" : {
- "description" : "checks the passed date is after or equal to the passed in date",
- "type" : "complex_rule",
- "parameter_descriptions" : {
- "error_code" : "code for the raised error",
- "financial_year_start_date" : "the date that the financial year starts",
- "field" : "the field to check",
- "entity" : "the entity the field exist on"
- },
- "parameter_defaults" : {},
- "rule_config": {
- "rules" : [],
- "filters" : [
- {
- "entity": "{{ entity }}",
- "name": "{{ field }}_is_valid",
- "expression": "{{ field }} >= {{ financial_year_start_date }}",
- "failure_type": "submission",
- "failure_message": "{{ field }} is before the start of the financial year",
- "error_code": "{{ error_code }}",
- "reporting_field": "{{ field }}",
- "is_informational" : false,
- "category": "Bad value"
- },
- ]
- }
- }
-}
-```
-Now that we have those rules define, we can use them in our regular configuration file.
-
-First we need to include the file in our rule_stores
->`example.json`
-```json
-{
- "parameters" : {
- "financial_year_start_date" : "'2025-04-01'"
- },
- "rule_stores": [
- {
- "store_type": "json",
- "filename": "complex_rules.dischema.json"
- },
- ],
- "filters" : [],
- "complex_rules" : [
- {
- "rule_name" : "date_is_ge_financial_year",
- "parameters" : {
- "field" : "StartDate",
- "error_code" : "1203",
- "entity" : "APCActivity",
- }
- },
- {
- "rule_name" : "date_is_ge_financial_year",
- "parameters" : {
- "field" : "EndDate",
- "error_code" : "1204",
- "entity" : "APCActivity",
- }
- },
- ...
- ]
-}
-```
-Note we've replaced the filters from the parameters section with complex rules. This requires that we pull in a rule_store. There are no limits to the number that can be included, and they can be shared across multiple versions of the specification. Now we have a rule that's defined once, and called multiple times with different parameters.
-
-> Note that `financial_year_start_date` isn't passed explicitly, that's because it's set as a parameter. Parameters are implicitly passed, you can be explicit if you prefer.
-
-## Rules
-We've covered adding filters to complex rules, but we can add rules to them aswell. This may be a bit of a misnomer, these are transformations on the data that get executed before filters. These operations include
-- select
-- takes an entity and performs a select for either adding new columns, removing columns.
-- remove
-- remove a given column
-- add column
-- adds a new column
-- group_by
-- perform an aggregation on an entity
-- filter_without_notifying
-- filter things without raising an error message, to do things like remove nulls before doing a regular filter
-- Joins
-- left
-- inner
-- anti_join
-- join to another table, any row that doesn't have a match in the other table will remain
-- more performant than doing a join then a null check
-- semi_join
-- join to another table, any row that doesn't have in the other table with be removed
-- join_header
-- joins a table with a single row onto every row. will raise an error if the header table has more than a single row.
-- used for things like checking submitting all dates in a file match the header
-- one_to_one_join
-- join to another entity expecting no change in the number of rows. integrity check can be toggled off
-> see [json_schemas/transformations](../json_schemas/transformations/) for expected fields for each operation
-
-Rules are executed in the order they are put into the array. So a join then select should be implemented in that order.
-
-```json
-{
- "rule_name" : {
- ...
- },
- "rules": [
- {
- "name": "Get CareId counts",
- "operation": "group_by",
- "entity": "{{ feed_type }}Activity",
- "new_entity_name": "{{ feed_type }}CareIdCounts",
- "group_by": "CareId",
- "agg_columns": {
- "COUNT(1)": "CareIdFreq"
- }
- },
- {
- "name": "Filter to keep only CareIds occuring more than once",
- "operation": "filter_without_notifying",
- "entity": "{{ feed_type }}CareIdCounts",
- "filter_rule": "CareIdFreq > 1"
- },
- {
- "name": "Inner join the activities onto the CareId counts",
- "operation": "inner_join",
- "entity": "{{ feed_type }}CareIdCounts",
- "target": "{{ feed_type }}Activity",
- "join_condition": "{{ feed_type }}CareIdCounts.CareId == {{ feed_type }}Activity.CareId",
- "new_columns": "{{ feed_type }}Activity.*"
- }
- ],
- "filters": [
- {
- "entity": "{{ feed_type }}CareIdCounts",
- "expression": "FALSE",
- "failure_type": "submission",
- "failure_message": "cannot be duplicate",
- "error_code": "1500",
- "reporting_entity": "{{ feed_type }}Activity",
- "reporting_field": "CareId",
- "category": "Bad value"
- }
- ],
- "post_filter_rules": [
- {
- "name": "Remove temporary entities",
- "operation": "remove_entity",
- "entity": "{{ feed_type }}CareIdCounts"
- }
- ],
- "dependencies" : []
-}
-```
-Above is an example taken from a [PLICS](https://digital.nhs.uk/data-and-information/data-tools-and-services/data-services/patient-level-information-and-costing-system-plics-data-collections) rule. We start with a `group_by`, that creates a new entity.
-
-> ⚠️ If you don't set a new entity, it will override the entity that's been used ⚠️
-
-We can then filter out any that don't have a count greater than 1. We then join back the activity date so it can be included in the error.
-
-The filter acts on the newly created entity, and since we've already filtered out any that's less than 1, all of the remaining are failures. So we raise a 1500 error for all of them.
-
-Then post-filter rules runs, and clears up the created entity.
-
-Finally we see the `dependencies` key, this is a list of rule names that this rule depends on. In this case it doesn't have any dependencies. But lets say we wanted to explode out an array, and that exploded version is used for many rules. We can make that explode a rule without any filters. Then other rules can depend on it.
-
-> Any dependencies need to be included in the complex rules of the `dischema.json` file
-
-## Reference data
-
-Reference data can be included, it's on object that takes the name you want to refer to the data as a key and the specification as a value:
-
-```json
-{
- "reference_data": {
- "allowed_submitters": {
- "type": "table",
- "database": "dve",
- "table_name": "refdata_plics_organisation_submitting_id"
- },
- "collection_activity": {
- "type": "table",
- "database": "dve",
- "table_name": "refdata_plics_int_collection_activity"
- },
- "collection_resource": {
- "type": "table",
- "database": "dve",
- "table_name": "refdata_plics_int_collection_resource"
- }
- }
-}
-```
-This allows us to refer to `refdata_plics_organisation_submitting_id` as `allowed_submitters` when we do things like anti-joins to it. The type is a table, it's in the dve database and the table name is `refdata_plics_organisation_submitting_id`. If we use the `SparkRefDataLoader` from `core_engine/backends/implementations/spark/reference_data.py` as our loader then this will lazily include the tables when they are used. There are other ways to specify reference data than just database objects - we can also specify relative file paths (from the location of the dischema location) or absolute uris.
-
-When using reference data we recommend using the `EntityManager` class, this prevents reference data from being mutated.
-
-an example in code for the parquet reader would be...
-```python
-ref_data_config = config.get_reference_data_config()
-rules = config.get_rule_metadata()
-SparkRefDataLoader.spark = spark
-SparkRefDataLoader.dataset_config_uri = "/path/to/folder/containing/dischema"
-
-ref_data = SparkRefDataLoader(
- ref_data_config,
-)
-
-entities = {...}
-entity_manager = EntityManager(entities, ref_data)
-
-business_rules.apply_rules(entity_manager, rules)
-```
-For the table loader it would be...
-```python
-ref_data_config = config.get_reference_data_config()
-rules = config.get_rule_metadata()
-ref_data = SparkTableRefDataLoader(ref_data_config)
-
-entities = {...}
-entity_manager = EntityManager(entities, ref_data)
-
-business_rules.apply_rules(entity_manager, rules)
-```
-
-...This can then be used in rules for refdata comparison:
-
-```json
-{
- "name": "Get the activities violating 1029",
- "operation": "anti_join",
- "entity": "{{ feed_type }}Activity",
- "target": "refdata_allowed_submitters",
- "join_condition": "{{ feed_type }}Activity.OrgId <=> refdata_allowed_submitters.Org_ID",
- "new_entity_name": "{{ feed_type }}1029Violators"
-}
-```
-> Note the prefix `refdata_` acts as an alias and allows for explicit join between entities.
diff --git a/docs/detailed_guidance/data_contract.md b/docs/detailed_guidance/data_contract.md
deleted file mode 100644
index 5be63d3..0000000
--- a/docs/detailed_guidance/data_contract.md
+++ /dev/null
@@ -1,315 +0,0 @@
-Lets look at the data contract configuration from [Introduction to DVE](../README.md) more closely, with a few more fields added:
-
-```json
-{
- "contract": {
- "cache_originals": true,
- "error_details": null,
- "types": {},
- "schemas": {},
- "datasets": {
- "CWTHeader": {
- "fields": {
- "version": {
- "description": null,
- "is_array": false,
- "callable": "constr",
- "constraints": {
- "regex": "\\d{1,2}\\.\\d{1,2}"
- }
- },
- "periodStartDate": {
- "description": null,
- "is_array": false,
- "callable": "conformatteddate",
- "constraints": {
- "date_format": "%Y-%m-%d"
- }
- },
- "periodEndDate": {
- "description": null,
- "is_array": false,
- "callable": "conformatteddate",
- "constraints": {
- "date_format": "%Y-%m-%d"
- }
- },
- },
- "mandatory_fields": [
- "version",
- "periodStartDate",
- "periodEndDate"
- ],
- "reporting_fields": [],
- "key_field": null,
- "reader_config": {
- ".xml": {
- "reader": "XMLStreamReader",
- "kwargs": {
- "record_tag": "Header",
- "n_records_to_read": 1
- },
- "field_names": null
- }
- },
- "aliases": {}
- },
- "CWTActivity": {
- "fields": {
- "activityStartDate":{
- "is_array": false,
- "callable": "conformatteddate",
- "constraints": {
- "date_format": "%Y-%m-%d"
- }
- }
- }
- }
- }
- }
-}
-```
-
-### Types
-
-Here we have only filled out datasets. We've added a few more fields such as `PeriodEndDate` and `activityStartDate` and we're starting to see a fair amount of duplication. Lets refactor this to remove that. For this we use `types`. This allows us to pre-configure a type and re-use it across the different datasets.
-
-```json
-{
- "contract": {
- "cache_originals": true,
- "error_details": null,
- "types": {
- "isodate": {
- "description": "an isoformatted date type",
- "callable": "conformatteddate",
- "constraints": {
- "date_format": "%Y-%m-%d"
- }
- }
- },
- "schemas": {},
- "datasets": {
- "CWTHeader": {
- "fields": {
- "version": {
- "description": null,
- "is_array": false,
- "callable": "constr",
- "constraints": {
- "regex": "\\d{1,2}\\.\\d{1,2}"
- }
- },
- "periodStartDate": "isodate",
- "periodEndDate": "isodate"
- },
- "mandatory_fields": [
- "version",
- "periodStartDate",
- "periodEndDate"
- ],
- "reporting_fields": [],
- "key_field": null,
- "reader_config": {
- ".xml": {
- "reader": "XMLStreamReader",
- "kwargs": {
- "record_tag": "Header",
- "n_records_to_read": 1
- }
- }
- },
- "aliases": {}
- },
- "CWTActivity": {
- "fields": {
- "activityStartDate": "isodate"
- },
- "reader_config": {
- ".xml": {
- "reader": "SparkXMLReader",
- "kwargs": {
- "record_tag": "Activity"
- }
- }
- }
- }
- }
- }
-}
-```
-
-Now we've added an `isodate` type in the `types` object. We can now use this pre-configured type elsewhere.
-
-### Schemas
-
-Schemas are used when a dataset has another nested dataset within. An example in XML would be:
-
-```xml
-
- 2025-01-02
- 2025-01-31
- 1111111111
- 01
-
- somecode
- 100
-
- abcd
- 10.10
-
-
- defg
- 20.20
-
-
-
-```
-
-We can see here that the Activity has a number of fields. `startdate`, `enddate` etc. However, `CstActivity` has its own fields. Including `resource` which has it's own fields. This is a use case for Schemas.
-
-```json
-{
- "contract": {
- "cache_originals": true,
- "error_details": null,
- "types": {
- "isodate": {
- "description": "an isoformatted date type",
- "callable": "conformatteddate",
- "constraints": {
- "date_format": "%Y-%m-%d"
- }
- }
- },
- "schemas": {
- "resource": {
- "fields": {
- "resource_id": "str",
- "cost": {
- "callable": "condecimal",
- "constraints": {
- "max_digits": 18,
- "decimal_places": 8
- }
- }
- },
- "mandatory_fields": [
- "cost",
- "resource_id"
- ]
- },
- "CstActivity": {
- "fields": {
- "cstCode": "str",
- "number": "int",
- "resource": {
- "model": "resource",
- "is_array": true
- }
- }
- }
- },
- "datasets": {
- "CWTActivity": {
- "fields": {
- "startdate": "isodate",
- "enddate": "isodate",
- "nhsnumber": "str",
- "nationalcode": "str",
- "CstActivity": {
- "model": "CstActivity",
- "is_array": true
- }
- },
- "reader_config": {
- ".xml": {
- "reader": "SparkXMLReader",
- "kwargs": {
- "record_tag": "Activity"
- }
- }
- }
- }
- }
- }
-}
-```
-
-There's a lot going on here. We've set the `CstActivity` to a `model` and set the `is_array` parameter to `true`. This builds it as an array of that model.
-
-The same is true for resource in `CstActivity`. In Spark this would create a schema that has an array of structs of `CstActivities` with an array of Structs of Resources.
-
-You can define as many schemas as you need to model your domain. This is particularly useful when the nested schemas don't have linkage IDs, so they can't be parsed as separate entities because the hierarchy would be lost.
-
-Schemas can have `mandatory_fields` but don't require reader configurations.
-
-### Field types
-
-Fields can have a type defined as a string, either a base type like `date`, `str`, a [Domain type](Domain%20types.md), or a defined [type](#types):
-
-```json
-{
- "startdate" : "date",
- "enddate" : "isodate",
- "numberofactivities": "NonNegativeInt"
-}
-```
-
-If the type is an array then it needs to be defined as an object rather than short hand with just a string.
-
-```json
-{
- "startdates" : {
- "type" : "date",
- "is_array" : true
- }
-}
-```
-
-It can be a model type defined in [schemas](#schemas), which can be also be an array or not.
-
-```json
-{
- "schemas" : {
- "APCCstActivity" : {
- "fields" : {
- ...
- }
- }
- },
- ...
- {
- "CstActivity" : {
- "model" : "APCCstActivity"
- },
- "Resources" : {
- "model" : "APCResources",
- "is_array" : true
- }
- }
-}
-```
-
-Finally callables. These are functions that return a type. Like `constr` from pydantic or `conformatteddate` in DVE [Domain types](Domain%20types.md). Any keyword arguments that go to these callables are passed in as `constraints`.
-
-```json
-{
- "ID": {
- "callable" : "constr",
- "constraints" : {
- "min_length" : 5,
- "max_length": 20,
- "regex" : "^ABC\w+"
- }
- },
- "nhsnumber" : {
- "callable" : "permissive_nhs_number",
- "constraints" : {
- "warn_on_test_numbers" : true
- }
- }
-}
-```
-
-In the example above, I've defined an `ID` field that is a constrained string type that should be between 5 and 20 characters in length and start with `ABC`. Additionally, I have also defined an `nhsnumber` field that raises warning when a test number is submitted (palindromes, or starts with 9).
diff --git a/docs/detailed_guidance/domain_types.md b/docs/detailed_guidance/domain_types.md
deleted file mode 100644
index 62279b1..0000000
--- a/docs/detailed_guidance/domain_types.md
+++ /dev/null
@@ -1,27 +0,0 @@
-# Domain Types
-
-Domain types are custom defined pydantic types that solve common problems with usual datasets or schemas defined in [Data contract](./data_contract.md).
-This might include Postcodes, NHS Numbers, dates with specific formats etc.
-
-Below is a list of defined types, their output type and any contraints. Nested beneath them are any constraints that area allowed and their default values if there are any.
-| Defined Type | Output Type | Contraints & Defaults | Supported Implementations |
-| ------------ | ----------- | --------------------- | ------------------------- |
-| NHSNumber | str | | Spark, DuckDB |
-| permissive_nhs_number | str |
| Spark, DuckDB |
-
-**Other types that are allowed include:**
-- str
-- int
-- date
-- datetime
-- Decimal
-- float
-- Any types that are included in [pydantic version 1.10](https://docs.pydantic.dev/1.10/usage/types/#pydantic-types)
diff --git a/docs/detailed_guidance/feedback_messages.md b/docs/detailed_guidance/feedback_messages.md
deleted file mode 100644
index 56ac983..0000000
--- a/docs/detailed_guidance/feedback_messages.md
+++ /dev/null
@@ -1 +0,0 @@
-WIP - it's a class in [DVE/core_engine/message.py](../../src/core_engine/message.py).
\ No newline at end of file
diff --git a/docs/detailed_guidance/file_transformation.md b/docs/detailed_guidance/file_transformation.md
deleted file mode 100644
index f83afe0..0000000
--- a/docs/detailed_guidance/file_transformation.md
+++ /dev/null
@@ -1 +0,0 @@
-WIP - See reader config in into
\ No newline at end of file
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 0000000..ba827ba
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,36 @@
+---
+title: Data Validation Engine
+tags:
+ - Introduction
+ - File Transformation
+ - Data Contract
+ - Business Rules
+ - Spark
+ - DuckDB
+---
+
+# Data Validation Engine
+
+The Data Validation Engine (DVE) is a configuration-driven data validation library written in [Python](https://www.python.org/), [Pydantic](https://docs.pydantic.dev/latest/) and a SQL backend currently consisting of [DuckDB](https://duckdb.org/) or [Spark](https://spark.apache.org/sql/). The configuration to run validations against a dataset is defined and written in a JSON document, which we will be referring to as the "dischema". The rules written within the dischema are designed to be run against all incoming data in a given submission - as this allows the DVE to capture all possible issues with the data without the submitter having to resubmit the same data repeatedly, which is burdensome and time-consuming for both the submitter and receiver of the data. Additionally, the rules can be configured to have the following behaviour:
+
+- **File Rejection** - The entire submission will be rejected if the given rule triggers one or more times.
+- **Row Rejection** - The row that triggered the rule will be rejected. Rows that pass the validation will be flowed through into a validated entity.
+- **Warning** - The rule will still trigger and be listed as a feedback message, but the record will still flow through into the validated entity.
+
+Certain scenarios prevent all validations from being executed. For more details, see the [File Transformation](user_guidance/file_transformation.md) section.
+
+The DVE has 3 core components:
+
+1. [File Transformation](user_guidance/file_transformation.md) - Parsing submitted files into a "stringified" (all fields cast to string) parquet format.
+
+2. [Data Contract](user_guidance/data_contract.md) - Validates submitted data against specified datatypes and casts successful records to those types, additionally providing modelling of your data.
+
+3. [Business rules](user_guidance/business_rules.md) - Performs simple and complex validations such as comparisons between fields, entities and/or lookups against reference data.
+
+For each component listed above, a [feedback message](user_guidance/feedback_messages.md) is generated whenever a rule is violated. These [feedback messages](user_guidance/feedback_messages.md) can be integrated directly into your system given you can consume `JSONL` files. Alternatively, we offer a fourth component called the [Error Reports](user_guidance/error_reports.md). This component will load the [feedback messages](user_guidance/feedback_messages.md) into an `.xlsx` (Excel) file which could be sent back to the submitter of the data. The excel file is compatible with services that offer spreadsheet reading such as [Microsoft Excel](https://www.microsoft.com/en/microsoft-365/excel), [Google Docs](https://docs.google.com/), [Libre Office Calc](https://www.libreoffice.org/discover/calc/) etc.
+
+DVE currently comes with two supported backend implementations. These are [DuckDB](user_guidance/implementations/duckdb.md) and [Spark](user_guidance/implementations/spark.md). If you need to write a custom backend implementation, you may want to look at the [Advanced User Guidance](advanced_guidance/new_backend.md) section.
+
+Feel free to use the Table of Contents on the left hand side of the page to navigate to sections of interest or to use the "Next" and "Previous" buttons at the bottom of each page if you want to read through each page in sequential order.
+
+If you have questions or need additional support with the DVE, then please raise an issue on our GitHub page [here](https://github.com/NHSDigital/data-validation-engine/issues).
diff --git a/docs/json_schemas/README.md b/docs/json_schemas/README.md
deleted file mode 100644
index 10d96e1..0000000
--- a/docs/json_schemas/README.md
+++ /dev/null
@@ -1,30 +0,0 @@
-# JSON Schemas
-
-These JSON schemas define the
-
-For autocomplete support in VS Code, alter `settings.json` and add new entries to the
-`json.schemas` array (or create this value if it's missing)
-
-```json
-{
- ...,
- "json.schemas": [
- {
- "fileMatch": [
- "*.dischema.json"
- ],
- "url": "./json_schemas/dataset.schema.json"
- },
- {
- "fileMatch": [
- "*.rulestore.json",
- "*_ruleset.json"
- ],
- "url": "./json_schemas/rule_store.schema.json"
- }
- ]
-}
-```
-
-Data Ingest JSON schemas (when saved with file_name `dataset.dischema.json`) should then have
-autocomplete support.
diff --git a/docs/user_guidance/auditing.md b/docs/user_guidance/auditing.md
new file mode 100644
index 0000000..445dbaf
--- /dev/null
+++ b/docs/user_guidance/auditing.md
@@ -0,0 +1,23 @@
+---
+tags:
+ - Auditing
+---
+
+The Auditing objects within the DVE are used to help control and store information about submitted data and what stage it's currently at. In addition to the above, it's also used to store statistics about the submission and the number of validations it has triggered etc. So, for users not interested in using the Error reports stage, you could source information directly from the audit tables.
+
+## Audit Tables
+
+Currently, these are the audit tables that can be accessed within the DVE:
+
+| Table Name | Purpose | When Available |
+| ----------------------- | ------- | -------------- |
+| `processing_status` | Contains information about the submission and what the current processing status is. | >= File Transformation |
+| `submission_info` | Contains information about the submitted file. | >= File Transformation |
+| `submission_statistics` | Contains validation statistics for each submission. | >= Error Reports |
+| `aggregates` | Contains aggregate counts of errors triggered for a submission | >= Error Reports |
+
+## Audit Objects
+
+You can use the following methods to help you interact with the tables above or you can query the tables via `sql`.
+
+You can read more about how to interact with the Audit Objects [here](../advanced_guidance/package_documentation/auditing.md).
diff --git a/docs/user_guidance/business_rules.md b/docs/user_guidance/business_rules.md
new file mode 100644
index 0000000..e7bd8b6
--- /dev/null
+++ b/docs/user_guidance/business_rules.md
@@ -0,0 +1,594 @@
+---
+title: Business Rules
+tags:
+ - Business Rules
+ - dischema
+ - Rule Store
+ - Reference Data
+---
+
+The Business Rules section contains the rules you want to apply to your dataset. Rule logic might include...
+
+- Checking if two or more fields are equivalent
+- Aggregating data to check if it matches a given value
+- Joining against other entities to compare values
+
+All rules are written in `SQL`. Depending on which [backend implementation](./implementations/) you have chosen, the syntax might differ between implementations.
+
+When writing the rules, you need to be aware that the expressions are negated (wrapped in a `NOT` expression). So, you should write the rules as though you are looking for non-problematic values.
+
+When rules are being applied, [Complex Rules](./business_rules.md#complex-rules) are always applied before [Rules](./business_rules.md#rules) and [Filters](./business_rules.md#filters).
+
+This page is meant to give you greater details on how you can write your Business Rules. If you want a summary of how the Business Rules work, then please refer to the [Getting Started](./getting_started.md#rules-configuration-introduction) page.
+
+## Filters
+
+For the simplest rules, you can write them in the filters section. For example, if you had a movies dataset where you wanted to check the length of the movie had a realistic duration then you could write a rule like this...
+
+
+=== "Record Rejection"
+
+ ```json title="movies.dischema.json"
+ {
+ "contract": {
+ "datasets": {
+ "movies": {
+ "fields": {
+ "duration_minutes": "int",
+ ...
+ },
+ ...
+ }
+ }
+ },
+ "transformations": {
+ "filters": [
+ {
+ "entity": "movies",
+ "name": "Ensure movie is less than 4 hours long",
+ "expression": "duration_minutes < 240",
+ "failure_type": "record",
+ "error_code": "MOVIE_TOO_LONG",
+ "failure_message": "Movie must be less than 4 hours long.",
+ "category": "Bad Value"
+ }
+ ]
+ }
+ }
+ ```
+
+=== "File Rejection"
+
+ ```json title="movies.dischema.json"
+ {
+ "contract": {
+ "datasets": {
+ "movies": {
+ "fields": {
+ "duration_minutes": "int",
+ ...
+ },
+ ...
+ }
+ }
+ },
+ "transformations": {
+ "filters": [
+ {
+ "entity": "movies",
+ "name": "Ensure movie is less than 4 hours long",
+ "expression": "duration_minutes < 240",
+ "failure_type": "submission",
+ "error_code": "MOVIE_TOO_LONG",
+ "failure_message": "Movie must be less than 4 hours long.",
+ "category": "Bad Value"
+ }
+ ]
+ }
+ }
+ ```
+
+=== "Warning"
+
+ ```json title="movies.dischema.json"
+ {
+ "contract": {
+ "datasets": {
+ "movies": {
+ "fields": {
+ "duration_minutes": "int",
+ ...
+ },
+ ...
+ }
+ }
+ },
+ "transformations": {
+ "filters": [
+ {
+ "entity": "movies",
+ "name": "Ensure movie is less than 4 hours long",
+ "expression": "duration_minutes < 240",
+ "failure_type": "record",
+ "is_informational": true,
+ "error_code": "MOVIE_TOO_LONG",
+ "failure_message": "Movie must be less than 4 hours long.",
+            "category": "Bad Value"
+ }
+ ]
+ }
+ }
+ ```
+
+The rule above can be written directly into the filters section because we do not need to perform any complex pre-step(s) such as filtering, aggregation(s), join(s) etc. We can simply select the fields of interest and perform the check.
+
+If you need to perform more complex rules, with pre-steps, then see the [Complex Rules](./business_rules.md#complex-rules) section further down this page.
+
+
+
+### Types of rejections
+
+You may have noticed the field "failure_type" in the example above. For any given rule (filter) you can reject a record, the whole file (submission) or just raise a warning. Here are the details around the currently supported Rejection Types:
+
+| Rejection Type | Behaviour | How to set in the rule |
+| -------------- | --------- | ---------------------- |
+| `submission` | Rejects the entire file. Even if it triggers once, no data will be projected in the final asset. | Set `failure_type` to `submission` |
+| `record` | Rejects the record that failed the check. Any records that fail will not be projected in the final asset | Set `failure_type` to `record` |
+| `warning` | Raises a warning that the record failed the check. This has no impact on whether the file/record is rejected. | Set `is_informational` to `true` |
+
+## Rules
+
+The `rules` section allows you to perform pre-steps to entities. For example, if you wanted to derive a new column, apply filters, aggregations, joins etc.
+
+With pre-steps, you can either modify an existing `entity` or create a new entity from an existing one. For example, here are pre-steps showing both modifying an existing entity and creating a new one:
+
+=== "Existing Entity"
+
+ ```json title="movies.dischema.json"
+ {
+ "contract": {
+ "datasets": {
+ "movies": {
+ "fields": {
+ "duration_minutes": "int",
+ ...
+ },
+ ...
+ }
+ }
+ },
+ "transformations": {
+ "rules": [
+ {
+ "name": "add duration_hours as a new column",
+ "operation": "add",
+ "entity": "movies",
+ "column_name": "duration_hours",
+ "expression": "(duration_minutes / 60)"
+ }
+ ]
+ }
+ }
+ ```
+
+=== "New Entity"
+
+ ```json title="movies.dischema.json"
+ {
+ "contract": {
+ "datasets": {
+ "movies": {
+ "fields": {
+ "duration_minutes": "int",
+ ...
+ },
+ ...
+ }
+ }
+ },
+ "transformations": {
+ "rules": [
+ {
+ "name": "add duration_hours as a new column",
+ "operation": "add",
+ "entity": "movies",
+ "column_name": "duration_hours",
+ "expression": "(duration_minutes / 60)",
+ "new_entity_name": "movies_modified"
+ }
+ ]
+ }
+ }
+ ```
+
+The difference between modifying the existing entity and adding a new one is simply adding `"new_entity_name": ""`.
+
+!!! warning
+
+    When adding new columns to an existing entity these will be projected in the final entity. This might be something that you want and have intended (derived fields) but if not, you will need to use the [post rule logic](./business_rules.md#post-rule) section to remove the column.
+
+### Operations
+
+For a full list of operations that you can perform during the pre-steps see [Advanced User Guidance: Operations](../advanced_guidance/package_documentation/operations.md).
+
+## Post Rule
+
+When a Business Rule has finished, "post step rules" can be run. This is useful in situations where you've created lots of new entities *or* you have added lots of new columns to existing entities.
+
+For new entities, you may not want to persist these in final outputs. If this is the case, then you can add post rules to remove the entity entirely or just a column in any existing entity (other than refdata entities). The code snippets below showcase how you can remove columns and new entities:
+
+=== "New Column Removal"
+
+ ```json title="movies.dischema.json"
+ {
+ "contract": {
+ "datasets": {
+ "movies": {
+ "fields": {
+ "duration_minutes": "int",
+ ...
+ },
+ ...
+ }
+ }
+ },
+ "transformations": {
+ "rules": [
+ {
+ "name": "add duration_hours as a new column",
+ "operation": "add",
+ "entity": "movies",
+ "column_name": "duration_hours",
+            "expression": "(duration_minutes / 60)"
+ }
+ ],
+ "filters": [
+ ...
+ ],
+ "post_filter_rules": [
+ {
+ "operation": "remove",
+ "entity": "movies",
+ "column_name": "duration_hours"
+ }
+ ]
+ }
+ }
+ ```
+
+
+=== "New Entity Removal"
+
+ ```json title="movies.dischema.json"
+ {
+ "contract": {
+ "datasets": {
+ "movies": {
+ "fields": {
+ "duration_minutes": "int",
+ ...
+ },
+ ...
+ }
+ }
+ },
+ "transformations": {
+ "rules": [
+ {
+ "name": "add duration_hours as a new column",
+ "operation": "add",
+ "entity": "movies",
+ "column_name": "duration_hours",
+ "expression": "(duration_minutes / 60)",
+ "new_entity_name": "movies_modified"
+ }
+ ],
+ "filters": [
+ ...
+ ],
+ "post_filter_rules": [
+ {
+ "operation": "remove_entity",
+ "entity": "movies_modified"
+ }
+ ]
+ }
+ }
+ ```
+
+
+## Reference Data
+
+If your Business Rules are reliant on reference data, then you can add the `"reference_data"` key to the `"transformations"` section. The snippet below shows various formats of reference data that you might want to add:
+
+=== "Parquet source"
+
+ ```json title="movies.dischema.json"
+ {
+ "transformations": {
+ "reference_data": {
+ "movie_genre_lookup":{
+ "type": "filename",
+ "filename": "path/to/my/movie_genre_lookup.parquet"
+ },
+ ...
+ }
+ }
+ }
+ ```
+
+=== "Arrow source"
+
+ ```json title="movies.dischema.json"
+ {
+ "transformations": {
+ "reference_data": {
+ "movie_genre_lookup":{
+ "type": "filename",
+ "filename": "path/to/my/movie_genre_lookup.arrow"
+ },
+ ...
+ }
+ }
+ }
+ ```
+
+=== "Database source"
+
+ ```json title="movies.dischema.json"
+ {
+ "transformations": {
+ "reference_data": {
+ "movie_genre_lookup": {
+ "type": "table",
+ "database": "my_database",
+ "table_name": "movie_genre_lookup"
+ },
+ ...
+ }
+ }
+ }
+ ```
+
+!!! note
+
+ - When a new reference data entity is created, it will always be prefixed with `refdata_`
+
+ !!! warning
+
+ - Refdata entities are also immutable. So, if you need to modify them in any way, you will always need to create a new entity from it
+
+For latest supported reference data types, see [Advanced User Guidance: Reference Data Types](../advanced_guidance/package_documentation/refence_data_types.md).
+
+## Complex Rules
+
+Complex Rules are recommended when you need to perform a number of "pre-step" operations before you can apply a business rule (filter). For instance, if you needed to add a column, filter and then join, you would need to add all these steps into your [Rules](./business_rules.md#rules) section. This might be ok if you only need a small number of pre-steps or only have a couple of rules. However, when you have lots of rules and more than one requires a number of operations, it's best to place these into a [Rulestore](./business_rules.md#rule-stores) and reference them within the complex rules. Rule Stores also have other benefits that you can read about [here](./business_rules.md#rule-stores).
+
+Here is an example of defining a complex rule:
+
+=== "dischema"
+
+ ```json title="movies.dischema.json"
+ {
+ "transformations": {
+ "parameters": {"entity": "movies"},
+ "reference_data": {
+ "sequels": {
+ "type": "table",
+ "database": "movies_refdata",
+ "table_name": "sequels"
+ }
+ },
+ "complex_rules": [
+ {
+ "rule_name": "ratings_count"
+ },
+ {
+ "rule_name": "poor_sequel_check",
+ "parameters": {
+ "sequel_entity": "refdata_sequels"
+ }
+ }
+ ]
+ }
+ }
+ ```
+
+=== "rulestore"
+
+ ```json title="movies_rulestore.json"
+ {
+ "ratings_count": {
+ "description": "Ensure more than 1 rating",
+ "type": "complex_rule",
+ "parameter_descriptions": {
+ "entity": "The entity to apply the workflow to."
+ },
+ "parameter_defaults": {},
+ "rule_config": {
+ "rules": [
+ {
+ "name": "Get count of ratings",
+ "operation": "add",
+ "entity": "{{entity}}",
+ "column_name": "no_of_ratings",
+ "expression": "length(ratings)"
+ }
+ ],
+ "filters": [
+ {
+ "name": "filter_too_few_ratings",
+ "entity": "{{entity}}",
+ "expression": "no_of_ratings > 1",
+ "error_code": "LIMITED_RATINGS",
+ "reporting_field": "title",
+ "failure_message": "Movie has too few ratings ({{ratings}})"
+ }
+ ],
+ "post_filter_rules": [
+ {
+ "name": "Remove the no_of_ratings field",
+ "operation": "remove",
+ "entity": "{{entity}}",
+ "column_name": "no_of_ratings"
+ }
+ ]
+ }
+ },
+ "poor_sequel_check": {
+ "description": "check if bad sequel exists",
+ "type": "complex_rule",
+ "parameter_descriptions": {
+ "entity": "The entity to apply the workflow to.",
+ "sequel_entity": "The entity containing sequel data"
+ },
+ "parameter_defaults": {},
+ "rule_config": {
+ "rules": [
+ {
+ "name": "Join sequel data",
+ "operation": "inner_join",
+ "entity": "{{entity}}",
+ "target": "{{sequel_entity}}",
+ "join_condition": "{{entity}}.title = {{sequel_entity}}.sequel_to",
+ "new_entity_name": "with_sequels",
+ "new_columns": {
+ "{{sequel_entity}}.ratings": "sequel_rating"
+ }
+ },
+ {
+ "name": "Get median sequel rating",
+ "operation": "group_by",
+ "entity": "with_sequels",
+ "group_by": "title",
+ "agg_columns": {
+ "list_aggregate(sequel_rating, 'median')": "median_sequel_rating"
+ }
+ }
+
+ ],
+ "filters": [
+ {
+ "name": "filter_rubbish_sequel",
+ "entity": "with_sequels",
+ "expression": "median_sequel_rating > 5",
+ "error_code": "RUBBISH_SEQUEL",
+ "reporting_entity": "derived",
+ "reporting_field": "title",
+ "failure_message": "The movie {{title}} has a rubbish sequel",
+ "is_informational": true
+ }
+ ],
+ "post_filter_rules": [
+ {
+ "name": "Remove the with_sequel entity",
+ "operation": "remove_entity",
+ "entity": "with_sequels"
+ }
+ ]
+ }
+ }
+ }
+ ```
+
+For all complex rules, you must set the key `"type"` to `"complex_rule"`. The description is optional, but future you will thank you when there is a quick explanation of what the rule is doing.
+
+After that, you define the `"rule_config"` key which defines the [Rules](./business_rules.md#rules), [Filters](./business_rules.md#filters) and [Post Rule steps](./business_rules.md#post-rule) to be applied.
+
+The sections below will cover the unique elements in a complex rule not already covered in the previous sections.
+
+### Parameters
+
+Parameters have two scopes. "Global" and "local".
+
+"Global" parameters can be defined as a new key under the `"transformations"` section. These can contain variables accessible by every single rule and filter.
+
+"Local" parameters are defined during the setup of a [Complex Rule](./business_rules.md#complex-rules).
+
+Below is an example showing how you would define them:
+
+=== "Global Example"
+
+ ```json title="movies.dischema.json"
+ {
+ "contract": {
+ ...
+ },
+ "transformations": {
+ "parameters": {
+ "param_name": "value",
+ "param_name2": "value",
+ ...
+ },
+ ...
+ }
+ }
+ ```
+
+=== "Local Example"
+
+ === "dischema"
+
+ ```json title="movies.dischema.json"
+ {
+ "contract": {
+ ...
+ },
+ "transformations": {
+ "complex_rules": [
+ {
+ "rule_name": "my_complex_rule",
+ "parameters": {
+ "param_key1": "value",
+ "param_key2": "value",
+ ...
+ }
+ }
+ ],
+ ...
+ }
+ }
+ ```
+
+ === "Rulestore"
+
+ ```json title="movies_rulestore.json"
+ {
+ "my_complex_rule": {
+ "parameter_descriptions": {
+ "param_key1": "required for x,y,z reason",
+              "param_key2": "lorem ipsum"
+ },
+ "parameter_defaults": {
+ "param_key2": "hello world"
+ },
+ "rule_config": {
+ ...
+ }
+ }
+ }
+ ```
+
+### Rule Stores
+
+Rule stores are separate JSON documents that you can load into the dischema document. The benefit of building rule stores is that you can reuse them across multiple dischema documents.
+
+To add a new rulestore simply add a new key called `"rule_stores"` under the transformation section. For example:
+
+```json title="movies.dischema.json"
+ {
+ "contract": {
+ ...
+ },
+ "transformations": {
+      "rule_stores": [
+ {
+ "store_type": "json",
+ "filename": ".json"
+ },
+ ...
+ ]
+ ...
+ }
+ }
+```
diff --git a/docs/user_guidance/data_contract.md b/docs/user_guidance/data_contract.md
new file mode 100644
index 0000000..b49e12f
--- /dev/null
+++ b/docs/user_guidance/data_contract.md
@@ -0,0 +1,449 @@
+---
+title: Data Contract
+tags:
+ - Contract
+ - Data Contract
+ - Domain Types
+---
+
+The Data Contract defines the structure (models) of your data and controls how it is typecast. We use [Pydantic](https://docs.pydantic.dev/1.10/) to generate and validate the models. This page is meant to give you greater details on how you should write your Data Contract. If you want a summary of how the Data Contract works, please refer to the [Getting Started](./getting_started.md#rules-configuration-introduction) page.
+
+!!! Note
+
+ We plan to migrate to Pydantic v2+ in a future release. This page currently reflects what is available through Pydantic v1.
+
+## Models
+
+The models within the Data Contract are written under the `datasets` key. For example, this is how you might define a model for a movies dataset:
+
+=== "movies.dischema.json"
+
+ ```json
+ {
+ "datasets": {
+ "movie": {
+ "fields": {
+ "movie_uuid": "int",
+ "movie_name": "str",
+ "year_released": "conformatteddate",
+ "genres": {
+ "type": "str",
+ "is_array": true
+ }
+ }
+ },
+ "cast": {
+ "fields": {
+ "actor_id": "int",
+ "actor_forename": "str",
+ "actor_surname": "str",
+ "character_name": "",
+ "movies_acted": {
+ "type": "int",
+ "is_array": true
+ }
+ }
+ }
+ }
+ }
+ ```
+
+=== "movies.json"
+
+ ```json
+ {
+ "movie_uuid": 1,
+ "movie_name": "John Doe & The Giant Peach",
+ "year_released": "1964-01-01",
+ "genres": [
+ "thriller",
+ "action",
+ "horror"
+ ],
+ "cast": {
+ "actor_forename": "John",
+ "actor_surname": "Doe",
+ "character_name": "John Doe",
+ "movies_acted": [
+ 1
+ ]
+ }
+ }
+ ```
+
+From the example above, we've built two models from the source data, which in turn will provide two separate entities to work with in the business rules and determine how the data will be written out at the end of the process. Those models being `"movie"` and `"cast"` with `fields` specifying the name of the columns and the data type they should be cast to. We will look into [data types later in this page](data_contract.md#types).
+
+
+### Mandatory Fields
+
+Within the Data Contract you can also specify `mandatory fields`. These are fields that must be present in the submitted data or a [Feedback Message](./feedback_messages.md) will be generated stating that the field is missing. You can define `mandatory fields` using the `required_fields` key like this...
+
+```json title="movies.dischema.json"
+{
+ "contract": {
+ "datasets": {
+ "movie": {
+ "fields": {
+ "movie_uuid": "int",
+ "movie_name": "str",
+ "year_released": "conformatteddate",
+ "genres": {
+ "type": "str",
+ "is_array": true
+ }
+ },
+ "required_fields": [
+ "movie_uuid",
+ "movie_name"
+ ]
+ },
+ "cast": {
+ "fields": {
+ "actor_id": "int",
+ "actor_forename": "str",
+ "actor_surname": "str",
+ "character_name": "",
+ "movies_acted": {
+ "type": "int",
+ "is_array": true
+ }
+ },
+ "required_fields": [
+ "actor_id",
+ "actor_forename",
+ "actor_surname"
+ ]
+ }
+ }
+ }
+}
+```
+
+### Key Fields
+
+You can define a `key_field` or `key_fields` within a given entity. These represent the unique identifiers within your dataset. `key_field` represents a single unique identifier, whereas `key_fields` allows a combination of fields to represent a unique record.
+
+This can be defined within the dischema like...
+
+```json title="movies.dischema.json"
+{
+ "contract": {
+ "datasets": {
+ "movie": {
+ "fields": {
+ "movie_uuid": "int",
+ "movie_name": "str",
+ ...
+ },
+ "key_fields": [
+ "movie_uuid",
+ "movie_name"
+ ]
+ },
+ "cast": {
+ "fields": {
+ "actor_id": "int",
+ ...
+ },
+ "key_field": "actor_id"
+ }
+ }
+ }
+}
+```
+
+### Readers
+
+You can define a reader for each specific model. You can have multiple readers if your incoming data is in multiple formats (e.g. csv & json). Here is an example of adding readers to our movie dataset example:
+
+```json title="movies.dischema.json"
+{
+ "contract": {
+ "datasets": {
+ "movie": {
+ "fields": {
+ ...
+ },
+        "required_fields": [
+          ...
+        ],
+ "reader_config": {
+ ".json": {
+ "reader": "DuckDBJSONReader",
+ "kwargs": {
+ "encoding": "utf-8",
+              "multi_line": true
+ }
+ }
+ }
+ },
+ "cast": {
+ "fields": {
+ ...
+ },
+        "required_fields": [
+          ...
+        ],
+ "reader_config": {
+ ".json": {
+ "reader": "DuckDBJSONReader",
+ "kwargs": {
+ "encoding": "utf-8",
+              "multi_line": true
+ }
+ }
+ }
+ }
+ }
+ }
+}
+```
+
+If you want to read more about the readers, please see the [File Transformation](./file_transformation.md) page.
+
+
+## Types
+
+Within the `fields` section of the contract you must define what data type a given field should be. Depending on how strict/lenient you want your types to be, a number of types are available to use. The types available are:
+
+- [Built-in standard library](https://docs.python.org/3.11/library/stdtypes.html) types (such as `int`, `str`, `date`) available with your version of Python installed for the DVE.
+- [Pydantic v1 types](https://docs.pydantic.dev/1.10/usage/types/)
+- [Custom Types](./data_contract.md#custom-types)
+- [Domain types](./data_contract.md#domain-types)
+
+### Constraints
+
+Given the DVE supports Pydantic types, you can use any of the [constrained types available](https://docs.pydantic.dev/1.10/usage/types/#constrained-types). The Pydantic docs will also show you what `kwarg` arguments are available for each constraint such as min/max length, regex patterns etc.
+
+For example, if you wanted to use a `constr` type for a field, you would define it like this:
+
+```json title="movies.dischema.json"
+{
+ "contract": {
+ "datasets": {
+ "movie": {
+ "fields": {
+ "movie_uuid": "int",
+ "movie_name": {
+ "callable": "constr",
+ "constraints": {
+ "min_length": 1,
+ "max_length": 20
+ }
+ },
+ ...
+ }
+ }
+ }
+ }
+}
+```
+
+In the example above we would be ensuring that the movie name is between 1 & 20 characters. If it is less than 1, or more than 20, a [Feedback Message](./feedback_messages.md) will be produced.
+
+### Custom Types
+
+As shown in the [Constraints](./data_contract.md#constraints) section above, you may want to apply the same constraints to many fields. A better way to define this rather than rewriting the constraints repeatedly for each field, is to define a custom type under the `types` key within the `contract` section. You can define a custom type like this:
+
+```json title="movies.dischema.json"
+{
+ "types": {
+ "MyConstrainedString": {
+ "callable": "constr",
+ "constraints": {
+ "min_length": 1,
+ "max_length": 20
+ }
+ }
+ },
+ "contract": {
+ "datasets": {
+ "movie": {
+ "fields": {
+ "movie_uuid": "int",
+ "movie_name": "MyConstrainedString",
+ ...
+ }
+ },
+ "cast": {
+ "fields": {
+ "actor_id": "int",
+ "actor_forename": "MyConstrainedString",
+ "actor_surname": "MyConstrainedString",
+ ...
+ }
+ }
+ }
+ }
+}
+```
+
+As you can see, we can set the "type" for several fields to `MyConstrainedString` which has a min & max length constraint.
+
+#### Domain Types
+
+Domain types are custom Pydantic model types available with the DVE. Current Domain types available are `Postcode`, `NHSNumber`, `FormattedDatetime` etc. You can find the full list of Domain Types [here](../advanced_guidance/package_documentation/domain_types.md).
+
+### Complex Types
+
+DVE supports the ability to define complex types such as `arrays`, `structs`, arrays of structs etc.
+
+To define a struct type, you would add it to the `types` section like this...
+
+```json title="movies.dischema.json"
+{
+ "contract": {
+ "types": {
+ "Actor": {
+ "actor_id": "int",
+ "actor_forename": "str",
+ "actor_surname": "str",
+ "character_name": "",
+ "movies_acted": {
+ "type": "int",
+ "is_array": true
+ }
+ }
+ },
+ "datasets": {
+ ...
+ }
+ }
+}
+```
+
+... and then you can simply add a new field with `model` set to the new type and `is_array` equal to `true` (or `false` if you just want a struct).
+
+```json title="movies.dischema.json"
+{
+ "contract": {
+ "types": {
+ "Actor": {
+ "actor_id": "int",
+ "actor_forename": "str",
+ "actor_surname": "str",
+ "character_name": "",
+ "movies_acted": {
+ "type": "int",
+ "is_array": true
+ }
+ }
+ },
+ "datasets": {
+ "movie": {
+ "fields": {
+ "movie_uuid": "int",
+ "movie_name": {
+ "callable": "constr",
+ "constraints": {
+ "min_length": 1,
+ "max_length": 20
+ }
+ },
+ "actors": {
+ "model": "Actor",
+ "is_array": true
+ }
+ }
+ }
+ }
+ }
+}
+```
+
+If you just want to turn a simple type into an array, simply set `is_array` to `true`. E.g.
+
+```json title="movies.dischema.json"
+{
+ "contract": {
+ "datasets": {
+ "cast": {
+ "fields": {
+ "movies_acted": {
+ "type": "int",
+ "is_array": true
+ }
+ }
+ }
+ }
+ }
+}
+```
+
+## Error Categories
+
+As mentioned earlier, when a field...
+
+- cannot be correctly type cast
+- breaks the constraints of the type
+- is missing when mandatory
+
+... a [Feedback Message](./feedback_messages.md) will be produced. Each error raised, will be categorised into one of...
+
+| Category | Meaning |
+| -------- | ------- |
+| Blank | The value is missing |
+| Wrong format | The value could not be cast into the defined type, i.e. str -> date, str -> int etc |
+| Bad value | The value broke one of the constraints |
+
+## Custom Error Details
+
+When a [Feedback Message](./feedback_messages.md) is produced during the contract a number of default error codes and messages are utilised. If you need to overhaul the error code and error message, you can create a custom contract error details `JSON` document. It can be setup in the following way:
+
+=== "movie.dischema.json"
+
+ ```json
+ {
+ "contract": {
+ "error_details": "movie_data_contract_details.json",
+        "datasets": {
+          "movie": {
+            "fields": {
+              "movie_uuid": "int",
+              "movie_name": "str",
+              ...
+            }
+          }
+        }
+ }
+ }
+ ```
+
+=== "movie_data_contract_details.json"
+
+ ```json
+ {
+ "movie_uuid": {
+ "Blank": {
+ "error_code": "MOVIE_UUID_01",
+ "error_message": "File Rejected - movie_uuid is blank."
+ },
+ "Bad Value": {
+ "error_code": "MOVIE_UUID_02",
+ "error_message": "File Rejected - movie_uuid has an incorrect data format. movie_uuid={{ movie_uuid }}."
+ }
+ },
+ "movie_name": {
+ "Bad Value": {
+ "error_code": "MOVIE_NAME_01",
+ "error_message": "File Rejected - movie_name has an incorrect data format. movie_name={{ movie_name }}."
+ }
+ }
+ }
+ ```
+
+!!! Warning
+
+ The contract details document must be in the same directory as the dischema document.
+
+
+## Cache Originals
+
+This setting allows you to retain a copy of the original entities (as defined within the dischema) before the business rules are applied. This is set to `false` by default. To enable it, simply add the following to your dischema document:
+
+```json title="movies.dischema.json"
+{
+ "contract": {
+ "cache_originals": true,
+ ...
+ }
+}
+```
diff --git a/docs/user_guidance/error_reports.md b/docs/user_guidance/error_reports.md
new file mode 100644
index 0000000..79c3be3
--- /dev/null
+++ b/docs/user_guidance/error_reports.md
@@ -0,0 +1,31 @@
+---
+title: Error Reports
+tags:
+ - Feedback
+ - Messages
+ - Error Reports
+---
+
+
+As mentioned in the [Introduction](../index.md), a fourth optional component is offered with the DVE. This is known as the Error Reports. This step will collate the [Feedback Messages](./feedback_messages.md) and populate them into a spreadsheet document.
+
+The Error Report will be available for each submission under `error_report/` folder.
+
+## Summary
+Contains metadata around the submission and whether it was successful.
+
+
+
+## Error Summary
+Contains an aggregation of all the errors and warnings that have occurred and how many times they occurred for that submission.
+
+
+
+## Error Data
+Provides a breakdown of every single error that occurred within a submission.
+
+
+
+!!! note
+
+ The images above were generated from our movies test dataset. You can view the rules and data [here](https://github.com/NHSDigital/data-validation-engine/tree/main/tests/testdata/movies).
diff --git a/docs/user_guidance/feedback_messages.md b/docs/user_guidance/feedback_messages.md
new file mode 100644
index 0000000..ee81288
--- /dev/null
+++ b/docs/user_guidance/feedback_messages.md
@@ -0,0 +1,31 @@
+---
+title: Feedback Messages
+tags:
+ - Feedback
+ - Messages
+---
+
+During the processing of a submission through the DVE, Feedback Messages will be produced.
+
+These messages are generated when...
+
+1. the data is structurally incorrect during the [File Transformation](./file_transformation.md) stage.
+2. the data has failed during the modelling and casting steps during the [Data Contract](./data_contract.md) stage.
+3. the data has failed one of the validation rules defined during the [Business Rules](./business_rules.md) stage.
+
+The messages are compiled into a `jsonl` file associated with the stage it failed in.
+
+The `jsonl` files produced will be in the same folder as your submission under a folder called `errors/`.
+
+## Processing Errors
+
+In situations where the DVE cannot continue (critical failure), a message will be produced and stored in the submission's folder under a folder called `processing_errors/`.
+
+The processing error `jsonl` file will contain information regarding why the DVE could not continue.
+
+If the DVE is crashing and the error message is either unreadable or is crashing when it should be working, please [raise an issue](https://github.com/NHSDigital/data-validation-engine/issues) on our GitHub.
+
+
+## Feedback Message Models
+
+Please refer to [Advanced User Guidance: Feedback Messages](../advanced_guidance/package_documentation/feedback_messages.md).
diff --git a/docs/user_guidance/file_transformation.md b/docs/user_guidance/file_transformation.md
new file mode 100644
index 0000000..41dc7de
--- /dev/null
+++ b/docs/user_guidance/file_transformation.md
@@ -0,0 +1,166 @@
+---
+title: File Transformation
+tags:
+ - Contract
+ - Data Contract
+ - File Transformation
+ - Readers
+---
+
+The File Transformation stage within the DVE is used to convert submitted files to stringified parquet format. This is critical as the rest of the stages within the DVE are reliant on the data being in parquet format. [Parquet was chosen as it's a very efficient column oriented format](https://www.databricks.com/glossary/what-is-parquet). When specifying which formats you are expecting, you will define it in your dischema like this:
+
+=== "DuckDB"
+
+ ```json
+ {
+ "contract": {
+ "datasets": {
+ "": {
+ "fields": {
+ ...
+ },
+            "reader_config": {
+              ".json": {
+                "reader": "DuckDBJSONReader",
+                "kwargs": {
+                  ...
+                }
+              },
+              ".xml": {
+                "reader": "DuckDBXMLStreamReader",
+                "kwargs": {
+                  ...
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+ ```
+
+=== "Spark"
+
+ ```json
+ {
+ "contract": {
+ "datasets": {
+ "": {
+ "fields": {
+ ...
+ },
+            "reader_config": {
+              ".csv": {
+                "reader": "SparkCSVReader",
+                "kwargs": {
+                  ...
+                }
+              },
+              ".json": {
+                "reader": "SparkJSONReader",
+                "kwargs": {
+                  ...
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+ ```
+
+The secondary use of the File Transformation stage is the ability to normalise your data into multiple entities. Imagine you had something like Hospital and Patient data in a single submission. You could split this out into separate entities so that the validated outputs of the data could be loaded into separate tables (parquet). For example:
+
+=== "DuckDB"
+
+ ```json
+ {
+ "contract": {
+ "datasets": {
+ "hospital": {
+ "fields": {
+ "hospital_id": "int",
+ "hospital_name": "string"
+ },
+ "reader_config": {
+ ".json": {
+ "reader": "DuckDBJSONReader",
+ "kwargs": {
+ "encoding": "utf-8",
+                  "multi_line": true
+ }
+ }
+ }
+ },
+ "patients": {
+ "fields": {
+ "patient_id": "int",
+ "patient_name": "string"
+ },
+ "reader_config": {
+ ".json": {
+ "reader": "DuckDBJSONReader",
+ "kwargs": {
+ "encoding": "utf-8",
+                  "multi_line": true
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ ```
+
+
+=== "Spark"
+
+ ```json
+ {
+ "contract": {
+ "datasets": {
+ "hospital": {
+ "fields": {
+ "hospital_id": "int",
+ "hospital_name": "string"
+ },
+ "reader_config": {
+ ".json": {
+ "reader": "SparkJSONReader",
+ "kwargs": {
+ "encoding": "utf-8",
+                  "multi_line": true
+ }
+ }
+ }
+ },
+ "patients": {
+ "fields": {
+ "patient_id": "int",
+ "patient_name": "string"
+ },
+ "reader_config": {
+ ".json": {
+ "reader": "SparkJSONReader",
+ "kwargs": {
+ "encoding": "utf-8",
+ "multi_line": true,
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ ```
+
+!!! abstract ""
+ You can read more about the readers and kwargs [here](../advanced_guidance/package_documentation/readers.md).
+
+## Supported Formats
+
+| Format | DuckDB | Spark | Version Available |
+| ------- | ------------------ | ------------------ | ----------------- |
+| `.csv` | :white_check_mark: | :white_check_mark: | >= 0.1.0 |
+| `.json` | :white_check_mark: | :white_check_mark: | >= 0.1.0 |
+| `.xml` | :white_check_mark: | :white_check_mark: | >= 0.1.0 |
diff --git a/docs/user_guidance/getting_started.md b/docs/user_guidance/getting_started.md
new file mode 100644
index 0000000..e938c2d
--- /dev/null
+++ b/docs/user_guidance/getting_started.md
@@ -0,0 +1,119 @@
+---
+title: Getting Started
+tags:
+ - Introduction
+ - Data Contract
+ - Business Rules
+---
+
+
+## Rules Configuration Introduction
+
+To use the DVE you will need to create a dischema document. The dischema document describes how the DVE should validate your data. It's divided into two primary parts. The first part is the `contract` (data contract) - this defines the structure of your data and determines how it is modeled and typecast. For example, here is a dischema document describing how the DVE may validate data about movies:
+
+!!! example "Example `movies.dischema.json`"
+
+ ```json
+ {
+ "contract": {
+ "schemas": {
+ "cast": {
+ "fields": {
+ "name": "str",
+ "role": "str",
+ "date_joined": "date"
+ }
+ }
+ },
+ "datasets": {
+ "movies": {
+ "fields": {
+ "title": "str",
+ "year": "int",
+ "genre": {
+ "type": "str",
+ "is_array": true
+ },
+ "duration_minutes": "int",
+ "ratings": {
+ "type": "NonNegativeFloat",
+ "is_array": true
+ },
+ "cast": {
+ "model": "cast",
+ "is_array": true
+ }
+          },
+          "mandatory_fields": [
+            "title",
+            "year"
+          ],
+          "reader_config": {
+            ".json": {
+              "reader": "SparkJSONReader"
+            }
+          }
+        }
+      }
+    }
+  }
+ ```
+
+Within the example above, there are two parent keys - `schemas` and `datasets`.
+
+`schemas` allow you to define custom complex data types. So, in the example above, the field `cast` would be expecting an array of structs containing the actor's name, role and the date they joined the movie.
+
+`datasets` describe the actual models for the entities you want to load. In the example above, we only want to load a single entity called `movies` which contains the fields `title, year, genre, duration_minutes, ratings and cast`. However, you could load the complex type `cast` into a separate entity if you wanted. This can be useful in situations where a given entity has all the information you need to perform a given validation rule against, making the rule faster & more efficient as there's less data to scan in a given entity.
+
+!!! note
+    The "splitting" of entities is considerably more useful in situations where you want to normalise/de-normalise your data. If you're unfamiliar with this concept, you can read more about it [here](https://en.wikipedia.org/wiki/Database_normalization). However, you should keep in mind potential performance impacts of doing this. If you have rules that require fields from different entities, you will have to perform a `join` between the split entities to be able to perform the rule.
+
+For each dataset definition, you will need to provide a `reader_config` which describes how to load the data during the [File Transformation](file_transformation.md) stage. So, in the example above, we expect `movies` to come in as a `JSON` file. However, you can add more readers if you have the same data in different data formats (e.g. `csv`, `xml`, `json`). Regardless of what file format, the [File Transformation](file_transformation.md) stage will convert the submitted data into a "stringified" parquet format which is a requirement for the subsequent stages.
+
+To learn more about how you can construct your Data Contract please read [here](data_contract.md).
+
+The second part of the dischema is the `transformations` (Business Rules). This section describes the validation rules you want to apply to entities defined within the `contract`. For example, with our `movies` dataset above, we may want to check that movies in this dataset are less than 4 hours long. The expression to write this check is written in SQL and that syntax may change slightly depending on the SQL backend you've chosen (we currently support [DuckDB](implementations/duckdb.md) and [Spark SQL](implementations/spark.md)).
+!!! example "Example `movies.dischema.json`"
+
+ ```json
+ {
+ "transformations": {
+        "filters": [
+          {
+            "entity": "movies",
+            "name": "Ensure movie is less than 4 hours long",
+            "expression": "duration_minutes < 240",
+            "error_code": "MOVIE_TOO_LONG",
+            "failure_message": "Movie must be less than 4 hours long."
+          }
+        ]
+ }
+ }
+ ```
+You may look at the expression above and think "Hang on! That's the opposite of what you want! You're only getting movies less than 4 hours!", __however, all validation rules are wrapped inside a `NOT` expression__. So, you write the rules as though you are looking for non-problematic values.
+
+We also offer a feature called `complex_rules`. These are rules where you need to transform the data before you can apply the rule. For instance, you may want to perform a join, aggregate the data, or perform a filter. The complex rules allow you to combine "pre-steps" before you perform the validation.
+
+To learn more about how to write your validation rules and complex validation rules, please follow the guidance [here](business_rules.md).
+
+
+## Utilising the Pipeline objects to run the DVE
+Within the DVE package, we have created the ability to build pipeline objects to help orchestrate the running of the DVE from start to finish. We currently have an implementation for `Spark` and `DuckDB` ready for users to use out of the box. The links below will direct you to detailed guidance on how you can setup a DVE pipeline.
+
+
+
+- :material-duck:{ .lg .middle } __Set up with DuckDB__
+
+ ---
+
+ [:octicons-arrow-right-24: Setup a DuckDB pipeline here](implementations/duckdb.md)
+
+- :material-shimmer:{ .lg .middle } __Set up with Spark__
+
+ ---
+
+ [:octicons-arrow-right-24: Setup a Spark pipeline here](implementations/spark.md)
+
+
+
+
diff --git a/docs/user_guidance/implementations/duckdb.md b/docs/user_guidance/implementations/duckdb.md
new file mode 100644
index 0000000..ac75c58
--- /dev/null
+++ b/docs/user_guidance/implementations/duckdb.md
@@ -0,0 +1,174 @@
+!!! quote
+ DuckDB is a high-performance analytical database system. It is designed to be fast, reliable, portable, and easy to use. DuckDB provides a rich SQL dialect with support far beyond basic SQL. DuckDB supports arbitrary and nested correlated subqueries, window functions, collations, complex types (arrays, structs, maps), and several extensions designed to make SQL easier to use.
+
+ DuckDB is available as a standalone CLI application and has clients for Python, R, Java, Wasm, etc., with deep integrations with packages such as pandas and dplyr.
+
+You can read more about DuckDB with the following links:
+
+- [Official Documentation :material-file-document-arrow-right:](https://duckdb.org/docs/stable/)
+- [GitHub :material-github:](https://github.com/duckdb/duckdb)
+
+## Setting up a DuckDB Connection
+
+To be able to use DuckDB with the DVE you first need to create a DuckDB connection object. You can simply do this with the following code:
+
+=== "Persist Database on memory"
+ ```py
+ import duckdb as ddb
+
+ db_path = ":memory:"
+ db_con = ddb.connect(db_path)
+ ```
+
+=== "Persist Database on disk"
+ ```py
+ import duckdb as ddb
+
+ db_path = "path/to/my_database.duckdb"
+ db_con = ddb.connect(db_path)
+ ```
+
+!!! note
+    You will need to close the db_con object with `db_con.close()`. Alternatively, you could build a custom [context manager](https://docs.python.org/3/library/contextlib.html) object to open and close the connection without needing to explicitly close the connection.
+
+
+Now you have the DuckDB connection object setup, you are ready to setup the required DVE objects.
+
+## Generating SubmissionInfo objects
+
+Before we utilise the DVE, we need to generate an iterable object containing `SubmissionInfo` objects. These objects effectively contain the necessary metadata for the DVE to work with a given submission. Here is an example function used to generate [SubmissionInfo](../../advanced_guidance/package_documentation/models.md#dve.core_engine.models.SubmissionInfo) objects from a given path:
+```py
+import glob
+from datetime import date, datetime
+from pathlib import Path
+from typing import Optional
+from uuid import uuid4
+
+from dve.core_engine.models import SubmissionInfo
+
+
+def generate_sub_infos_from_submissions_path(
+ submission_path: Path,
+ dataset_id: Optional[str] = "example",
+ submitting_org: Optional[str] = None,
+ submission_method: Optional[str] = "local_test",
+ reporting_period_start_date: Optional[date | datetime] = None,
+ reporting_period_end_date: Optional[date | datetime] = None,
+) -> list[SubmissionInfo]:
+ sub_infos: list[SubmissionInfo] = []
+ for f in glob.glob(str(submission_path) + "/*.*"):
+ file_path = Path(f)
+ file_stats = file_path.stat()
+ metadata = {
+ "dataset_id": dataset_id,
+ "file_name": file_path.stem,
+ "file_extension": file_path.suffix,
+ "submission_method": submission_method,
+ "file_size": file_stats.st_size,
+ "datetime_received": datetime.now(),
+ }
+ if submitting_org:
+ metadata["submitting_org"] = submitting_org
+ if reporting_period_start_date:
+ metadata["reporting_period_start"] = str(reporting_period_start_date)
+ if reporting_period_end_date:
+ metadata["reporting_period_end"] = str(reporting_period_end_date)
+
+ sub_infos.append(SubmissionInfo(submission_id=uuid4().hex, **metadata))
+ return sub_infos
+
+
+submissions = generate_sub_infos_from_submissions_path(Path("path", "to", "my", "submissions"))
+```
+
+!!! note
+    If you have a large number of submissions, it may be worth converting the above into a [generator](https://docs.python.org/3/reference/expressions.html#generator-expressions). Using the example above, you can do this by simply removing the sub_infos list and yielding a SubmissionInfo object per file returned from the glob iterator.
+
+## DuckDB Audit Table Setup
+
+The first object you must setup is an "Audit Manager Object". This can be done with the following code:
+
+```py
+from dve.core_engine.backends.implementations.duckdb.auditing import DDBAuditingManager
+
+audit_manager = DDBAuditingManager(db_path, connection=db_con) # type: ignore
+```
+
+The "Audit Manager" object within the DVE is used to keep track of the status of your submission. A submission for instance could fail during the File Transformation section, so it's important that we have something to keep track of the submission. The Audit Manager object has a number of methods that can be used to read/write information to tables being stored within the duckdb connection setup in the previous step.
+
+You can learn more about the Auditing Objects [here](../auditing.md).
+
+Once you have setup your "Audit Manager" object, we can move onto setting up the DuckDB reference data loader (if required) and then setting up the DuckDB DVE Pipeline object.
+
+## DuckDB Reference Data Setup (Optional)
+If your business rules are reliant on utilising reference data, you will need to write the following code to ensure that reference data can be loaded during the application of those rules:
+
+```py
+from dve.core_engine.backends.implementations.duckdb.reference_data import DuckDBRefDataLoader
+
+DuckDBRefDataLoader.connection = db_con
+DuckDBRefDataLoader.dataset_config_uri = Path("path", "to", "my", "rules").as_posix()
+```
+
+The connection passed into the `DuckDBRefDataLoader` object will then be able to use various DuckDB readers to load data from an existing table on the connection OR loading data from reference data persisted in either `parquet` or `pyarrow` format.
+
+If you want to learn more about the reference data loaders, you can view the advanced user guidance [here](../../advanced_guidance/package_documentation/refdata_loaders.md).
+
+Now we can move onto setting up the DuckDB DVE Pipeline object.
+
+## DuckDB Pipeline Setup
+
+To setup a DuckDB Pipeline, you can use the following example below:
+
+=== "Without Rules"
+
+ ```py
+
+ from dve.pipeline.duckdb_pipeline import DDBDVEPipeline
+
+
+ dve_pipeline = DDBDVEPipeline(
+ processed_files_path=Path("location_to_store", "dve_outputs").as_posix(),
+ audit_tables=audit_manager,
+ connection=db_con,
+ submitted_files_path=Path("submissions", "path").as_posix(),
+ reference_data_loader=DuckDBRefDataLoader,
+ )
+ ```
+
+=== "With Rules"
+
+ ```py
+ from dve.pipeline.duckdb_pipeline import DDBDVEPipeline
+
+
+ dve_pipeline = DDBDVEPipeline(
+ processed_files_path=Path("location_to_store", "dve_outputs").as_posix(),
+ audit_tables=audit_manager,
+ connection=db_con,
+ rules_path=Path("to", "my", "rules").as_posix(),
+ submitted_files_path=Path("submissions", "path").as_posix(),
+ reference_data_loader=DuckDBRefDataLoader,
+ )
+ ```
+
+!!! note
+ If using remote resources, then you will want to use `as_uri` for your paths.
+
+ E.g.
+ ```py
+ Path("remote", "path").as_uri()
+ ```
+
+Once your Pipeline object is defined, you can simply run the `cluster_pipeline_run` method. E.g.
+
+```py
+error_reports = dve_pipeline.cluster_pipeline_run()
+```
+
+
+## Further documentation
+For further details on the objects referenced above, you can use the following links to read more about the objects:
+
+- [Pipeline Docs](../../advanced_guidance/package_documentation/pipeline.md)
+- [Reference Data Docs](../../advanced_guidance/package_documentation/refdata_loaders.md)
diff --git a/docs/user_guidance/implementations/mixing_implementations.md b/docs/user_guidance/implementations/mixing_implementations.md
new file mode 100644
index 0000000..dc75aeb
--- /dev/null
+++ b/docs/user_guidance/implementations/mixing_implementations.md
@@ -0,0 +1,30 @@
+
+## Mixing backend implementations
+
+The examples shown in the [Spark implementation guidance](spark.md) are using the Spark Backend. DVE also has a DuckDB backend found at [core_engine.backends.implementations.duckdb](https://github.com/NHSDigital/data-validation-engine/tree/main/src/dve/core_engine/backends/implementations/duckdb). In order to mix the two you will need to convert from one type of entity to the other. For example from a spark `Dataframe` to DuckDB `relation`. The easiest way to do this is to use the `write_parquet` method from one backend and use `read_parquet` from another backend.
+
+Currently the configuration isn't backend agnostic for applying business rules. So if you want to swap between spark and duckdb, the business rules need to be written using only features that are common to both backends. For example, a regex check in spark would be something along the lines of...
+```sql
+nhsnumber rlike '^\d{10}$'
+```
+...but in duckdb it would be...
+```sql
+regexp_matches(nhsnumber, '^\d{10}$')
+```
+Failures in parsing the expressions lead to failure messages such as
+```python
+FeedbackMessage(
+ entity=None,
+ record=None,
+ failure_type='integrity',
+ is_informational=False,
+ error_type=None,
+ error_location=None,
+ error_message="Unexpected error (AnalysisException: Undefined function: 'regexp_matches'. This function is neither a registered temporary function nor a permanent function registered in the database 'default'.; line 1 pos 5) in transformations (rule: root; step: 0; id: None)",
+ error_code=None,
+ reporting_field=None,
+ reporting_field_name=None,
+ value=None,
+ category=None
+)
+```
\ No newline at end of file
diff --git a/docs/user_guidance/implementations/platform_specific/databricks.md b/docs/user_guidance/implementations/platform_specific/databricks.md
new file mode 100644
index 0000000..e478c0d
--- /dev/null
+++ b/docs/user_guidance/implementations/platform_specific/databricks.md
@@ -0,0 +1,10 @@
+## Installation
+
+Firstly, please ensure that you've read the guidance on our [installation section](../../install.md).
+
+You can follow these guides to help you install the Data Validation Engine onto a Databricks Cluster:
+
+- [AWS](https://docs.databricks.com/aws/en/libraries/)
+- [GCP](https://docs.databricks.com/gcp/en/libraries/)
+- [Microsoft Azure](https://learn.microsoft.com/en-us/azure/databricks/libraries/)
+
diff --git a/docs/user_guidance/implementations/platform_specific/palantir_foundry.md b/docs/user_guidance/implementations/platform_specific/palantir_foundry.md
new file mode 100644
index 0000000..1c63d00
--- /dev/null
+++ b/docs/user_guidance/implementations/platform_specific/palantir_foundry.md
@@ -0,0 +1,2 @@
+!!! note
+ This section has not yet been written. Coming soon.
diff --git a/docs/user_guidance/implementations/spark.md b/docs/user_guidance/implementations/spark.md
new file mode 100644
index 0000000..23b82d9
--- /dev/null
+++ b/docs/user_guidance/implementations/spark.md
@@ -0,0 +1,190 @@
+!!! quote
+ Apache Spark™ is a multi-language engine for executing data engineering, data science, and machine learning on single-node machines or clusters.
+
+You can read more about Spark here with the following links:
+
+- [Official Documentation :material-file-document-arrow-right:](https://spark.apache.org/)
+- [GitHub :material-github:](https://github.com/apache/spark)
+
+
+## Setting up a Spark Session
+
+!!! note
+
+ The Audit Tables require delta package available with Spark. The example below will include that.
+
+For a minimal working Spark Session setup with DVE, you can use the following snippet of code:
+```py
+import os
+import tempfile
+from pyspark.sql import SparkSession
+
+def get_spark_session() -> SparkSession:
+ """Get a configured Spark Session. This MUST be called before any other Spark session is created."""
+ temp_dir = tempfile.mkdtemp()
+ os.environ["PYSPARK_SUBMIT_ARGS"] = " ".join(
+ [
+ "--packages",
+ "com.databricks:spark-xml_2.12:0.16.0,io.delta:delta-core_2.12:2.4.0",
+ "pyspark-shell",
+ ]
+ )
+ spark_session = (
+ SparkSession.builder.config("spark.sql.warehouse.dir", temp_dir)
+ .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
+ .config(
+ "spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"
+ )
+ .getOrCreate()
+    )
+    return spark_session
+```
+
+You can learn more about setting up a Spark Session [here](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.SparkSession.html).
+
+!!! warning
+
+ If you need to load XML data and the version of spark you're running is < 4.0.0, you'll need the `spark-xml` extension. You can read more about it [here](https://github.com/databricks/spark-xml). The snippet above shows an example of this being installed.
+
+
+## Generating SubmissionInfo Objects
+
+Before we utilise the DVE, we need to generate an iterable object containing `SubmissionInfo` objects. These objects effectively contain the necessary metadata for the DVE to work with a given submission. Here is an example function used to generate SubmissionInfo objects from a given path:
+
+```py
+import glob
+from datetime import date, datetime
+from pathlib import Path
+from typing import Optional
+from uuid import uuid4
+
+from dve.core_engine.models import SubmissionInfo
+
+
+def generate_sub_infos_from_submissions_path(
+ submission_path: Path,
+ dataset_id: Optional[str] = "example",
+ submitting_org: Optional[str] = None,
+ submission_method: Optional[str] = "local_test",
+ reporting_period_start_date: Optional[date | datetime] = None,
+ reporting_period_end_date: Optional[date | datetime] = None,
+) -> list[SubmissionInfo]:
+ sub_infos: list[SubmissionInfo] = []
+ for f in glob.glob(str(submission_path) + "/*.*"):
+ file_path = Path(f)
+ file_stats = file_path.stat()
+ metadata = {
+ "dataset_id": dataset_id,
+ "file_name": file_path.stem,
+ "file_extension": file_path.suffix,
+ "submission_method": submission_method,
+ "file_size": file_stats.st_size,
+ "datetime_received": datetime.now(),
+ }
+ if submitting_org:
+ metadata["submitting_org"] = submitting_org
+ if reporting_period_start_date:
+ metadata["reporting_period_start"] = str(reporting_period_start_date)
+ if reporting_period_end_date:
+ metadata["reporting_period_end"] = str(reporting_period_end_date)
+
+ sub_infos.append(SubmissionInfo(submission_id=uuid4().hex, **metadata))
+ return sub_infos
+
+
+submissions = generate_sub_infos_from_submissions_path(Path("path", "to", "my", "submissions"))
+```
+
+!!! note
+    If you have a large number of submissions, it may be worth converting the above into a [generator](https://docs.python.org/3/reference/expressions.html#generator-expressions). Using the example above, you can do this by simply removing the sub_infos list and yielding a SubmissionInfo object per file returned from the glob iterator.
+
+## Spark Audit Table Setup
+
+The first object you must setup is an "Audit Manager Object". This can be done with the following code:
+
+```py
+from dve.core_engine.backends.implementations.spark.auditing import SparkAuditingManager
+
+db_name = "test_dve"
+spark.sql(f"CREATE DATABASE {db_name};")
+
+audit_manager = SparkAuditingManager(db_name, spark)
+```
+
+!!! note
+
+ `spark` session is optional for the `SparkAuditingManager`. If not provided a spark session will be generated.
+
+The "Audit Manager" object within the DVE is used to keep track of the status of your submission. A submission for instance could fail during the File Transformation section, so it's important that we have something to keep track of the submission. The Audit Manager object has a number of methods that can be used to read/write information to tables being stored within the database created in the previous step.
+
+You can learn more about the Auditing Objects [here](../auditing.md).
+
+Once you have setup your "Audit Manager" object, we can move onto setting up the Spark reference data loader (if required) and then setting up the Spark DVE Pipeline object.
+
+## Spark Reference Data Setup (Optional)
+If your business rules are reliant on utilising reference data, you will need to write the following code to ensure that reference data can be loaded during the application of those rules:
+
+```py
+from pathlib import Path
+
+from dve.core_engine.backends.implementations.spark.reference_data import SparkRefDataLoader
+
+SparkRefDataLoader.spark = spark
+SparkRefDataLoader.dataset_config_uri = Path("path", "to", "my", "rules").as_posix()
+```
+
+## Spark Pipeline Setup
+
+To setup a Spark Pipeline, you can use the following example below:
+
+=== "Without Rules"
+
+ ```py
+
+ from dve.pipeline.spark_pipeline import SparkDVEPipeline
+
+
+ dve_pipeline = SparkDVEPipeline(
+ processed_files_path=Path("location_to_store", "dve_outputs").as_posix(),
+ audit_tables=audit_manager,
+ submitted_files_path=Path("submissions", "path").as_posix(),
+ reference_data_loader=SparkRefDataLoader,
+ spark=spark,
+ )
+ ```
+
+=== "With Rules"
+
+ ```py
+ from dve.pipeline.spark_pipeline import SparkDVEPipeline
+
+
+ dve_pipeline = SparkDVEPipeline(
+ processed_files_path=Path("location_to_store", "dve_outputs").as_posix(),
+ audit_tables=audit_manager,
+ rules_path=Path("to", "my", "rules").as_posix(),
+ submitted_files_path=Path("submissions", "path").as_posix(),
+ reference_data_loader=SparkRefDataLoader,
+ spark=spark,
+ )
+ ```
+
+!!! note
+ If using remote resources, then you will want to use `as_uri` for your paths.
+
+ E.g.
+ ```py
+ Path("remote", "path").as_uri()
+ ```
+
+Once your Pipeline object is defined, you can simply run the `cluster_pipeline_run` method. E.g.
+
+```py
+error_reports = dve_pipeline.cluster_pipeline_run()
+```
+
+## Further documentation
+
+For further details on the objects referenced above, you can use the following links to read more about the objects:
+
+- [Pipeline Docs](../../advanced_guidance/package_documentation/pipeline.md)
+- [Reference Data Docs](../../advanced_guidance/package_documentation/refdata_loaders.md)
diff --git a/docs/user_guidance/install.md b/docs/user_guidance/install.md
new file mode 100644
index 0000000..00d7a9c
--- /dev/null
+++ b/docs/user_guidance/install.md
@@ -0,0 +1,88 @@
+---
+title: Installing the Data Validation Engine
+tags:
+ - Introduction
+ - Installation
+---
+
+!!! warning
+ **DVE is currently an unstable package. Expect breaking changes between every minor patch**. We intend to follow semantic versioning of `major.minor.patch` more strictly after a 1.0 release. Until then, we recommend that you pin your install to the latest version available and keep an eye on [future releases](https://github.com/NHSDigital/data-validation-engine/releases).
+
+ **Please note that we only support Python runtimes of 3.10 and 3.11.** In the future we will look to add support for Python versions greater than 3.11, but it's not an immediate priority.
+
+    If working on Python 3.7, the `0.1` release supports this (and only this) version of Python. However, we have not been updating that version with any bugfixes, performance improvements etc. There are also a number of vulnerable dependencies on version `0.1` release due to [Python 3.7 being deprecated](https://devguide.python.org/versions/) and a number of packages dropping support. **If you choose to install `0.1`, you accept the risks of doing so and additional support will not be provided.**
+
+You can install the DVE package through python package managers such as [pip](https://pypi.org/project/pip/), [pipx](https://github.com/pypa/pipx), [uv](https://docs.astral.sh/uv/) and [poetry](https://python-poetry.org/).
+
+=== "pip"
+
+ ```sh
+ pip install data-validation-engine
+ ```
+
+=== "pipx"
+
+ ```sh
+ pipx install data-validation-engine
+ ```
+
+=== "uv"
+
+ Add to your existing `uv` project...
+ ```sh
+ uv add data-validation-engine
+ ```
+
+ ...or you can add via your `pyproject.toml`...
+
+ ```toml
+ dependencies = [
+        "data-validation-engine"
+ ]
+ ```
+
+ ```sh
+ uv lock
+ ```
+
+ ```sh
+ uv sync
+ ```
+
+=== "poetry"
+
+ Add to your existing `poetry` project...
+ ```sh
+ poetry add data-validation-engine
+ ```
+
+ ...or you can add via your `pyproject.toml`...
+
+ ```toml
+ [tool.poetry.dependencies]
+ data-validation-engine = "*"
+ ```
+
+ ```sh
+ poetry lock
+ ```
+
+ ```sh
+ poetry install
+ ```
+
+!!! info
+ We are working on getting the DVE available via Conda. We will update this page with the relevant instructions once this has been successfully setup.
+
+Python dependencies are listed in the [`pyproject.toml`](https://github.com/NHSDigital/data-validation-engine/blob/main/pyproject.toml). Many of the dependencies are locked to quite restrictive versions due to complexity of this package. Core packages such as Pydantic, Pyspark and DuckDB are unlikely to receive flexible version constraints as changes in those packages could cause the DVE to malfunction. For less important dependencies, we have tried to make the constraints more flexible. Therefore, we would advise you to install the DVE into a separate environment rather than trying to integrate it into an existing Python environment.
+
+Once you have installed the DVE you are almost ready to use it. To be able to run the DVE, you will need to choose one of the supported pipeline runners (see Backend implementations here - [DuckDB](implementations/duckdb.md) *or* [Spark](implementations/spark.md)) and you will need to create your own dischema document to configure how the DVE should validate incoming data. You can read more about this in the [Getting Started](getting_started.md) page.
+
+
+## DVE Version Compatibility Matrix
+
+| DVE Version | Python Version | DuckDB Version | Spark Version | Pydantic Version |
+| ------------ | -------------- | -------------- | ------------- | ---------------- |
+| >=0.6 | >=3.10,<3.12 | 1.1.* | 3.4.* | 1.10.15 |
+| >=0.2,<0.6 | >=3.10,<3.12 | 1.1.0 | 3.4.4 | 1.10.15 |
+| 0.1 | >=3.7.2,<3.8 | 1.1.0 | 3.2.1 | 1.10.15 |
diff --git a/includes/jargon_and_acronyms.md b/includes/jargon_and_acronyms.md
new file mode 100644
index 0000000..9b42c4e
--- /dev/null
+++ b/includes/jargon_and_acronyms.md
@@ -0,0 +1,6 @@
+*[DVE]: Data Validation Engine
+*[dischema]: Data ingest schema
+*[stringified]: all fields cast to string
+*[kwarg]: Key Word Arguments
+*[constr]: Constrained String
+*[immutable]: Unchanging over time or unable to be changed
diff --git a/poetry.lock b/poetry.lock
index ca9cf31..c7a90ed 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -779,14 +779,14 @@ files = [
[[package]]
name = "click"
-version = "8.3.1"
+version = "8.2.1"
description = "Composable command line interface toolkit"
optional = false
python-versions = ">=3.10"
-groups = ["dev", "lint"]
+groups = ["dev", "docs", "lint"]
files = [
- {file = "click-8.3.1-py3-none-any.whl", hash = "sha256:981153a64e25f12d547d3426c367a4857371575ee7ad18df2a6183ab0545b2a6"},
- {file = "click-8.3.1.tar.gz", hash = "sha256:12ff4785d337a1bb490bb7e9c2b1ee5da3112e94a8622f26a6c77f5d2fc6842a"},
+ {file = "click-8.2.1-py3-none-any.whl", hash = "sha256:61a3265b914e850b85317d0b3109c7f8cd35a670f963866005d6ef1d5175a12b"},
+ {file = "click-8.2.1.tar.gz", hash = "sha256:27c491cc05d968d271d5a1db13e3b5a184636d9d930f148c50b038f0d0646202"},
]
[package.dependencies]
@@ -798,12 +798,12 @@ version = "0.4.6"
description = "Cross-platform colored terminal text."
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
-groups = ["dev", "lint", "test"]
+groups = ["dev", "docs", "lint", "test"]
files = [
{file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
]
-markers = {lint = "platform_system == \"Windows\" or sys_platform == \"win32\""}
+markers = {docs = "platform_system == \"Windows\"", lint = "platform_system == \"Windows\" or sys_platform == \"win32\""}
[[package]]
name = "commitizen"
@@ -1024,14 +1024,14 @@ files = [
[[package]]
name = "cucumber-tag-expressions"
-version = "9.0.0"
+version = "9.1.0"
description = "Provides a tag-expression parser and evaluation logic for cucumber/behave"
optional = false
python-versions = ">=3.10"
groups = ["dev", "test"]
files = [
- {file = "cucumber_tag_expressions-9.0.0-py3-none-any.whl", hash = "sha256:36f3eacf49ad24feeb60218db4c51ab114853b3f022f4f3ad790c32b7597faee"},
- {file = "cucumber_tag_expressions-9.0.0.tar.gz", hash = "sha256:731302c12bd602309596b35e733c1021b517d4948329803c23ca026e26ef4e99"},
+ {file = "cucumber_tag_expressions-9.1.0-py3-none-any.whl", hash = "sha256:cca145d677a942c1877e5a2cf13da8c6ec99260988877c817efd284d8455bb56"},
+ {file = "cucumber_tag_expressions-9.1.0.tar.gz", hash = "sha256:d960383d5885300ebcbcb14e41657946fde2a59d5c0f485eb291bc6a0e228acc"},
]
[[package]]
@@ -1046,6 +1046,21 @@ files = [
{file = "decli-0.6.3.tar.gz", hash = "sha256:87f9d39361adf7f16b9ca6e3b614badf7519da13092f2db3c80ca223c53c7656"},
]
+[[package]]
+name = "deepmerge"
+version = "2.0"
+description = "A toolset for deeply merging Python dictionaries."
+optional = false
+python-versions = ">=3.8"
+groups = ["docs"]
+files = [
+ {file = "deepmerge-2.0-py3-none-any.whl", hash = "sha256:6de9ce507115cff0bed95ff0ce9ecc31088ef50cbdf09bc90a09349a318b3d00"},
+ {file = "deepmerge-2.0.tar.gz", hash = "sha256:5c3d86081fbebd04dd5de03626a0607b809a98fb6ccba5770b62466fe940ff20"},
+]
+
+[package.extras]
+dev = ["black", "build", "mypy", "pytest", "pyupgrade", "twine", "validate-pyproject[all]"]
+
[[package]]
name = "delta-spark"
version = "2.4.0"
@@ -1218,16 +1233,48 @@ python-dateutil = ">=2.4"
[[package]]
name = "filelock"
-version = "3.21.2"
+version = "3.24.3"
description = "A platform independent file lock."
optional = false
python-versions = ">=3.10"
groups = ["dev"]
files = [
- {file = "filelock-3.21.2-py3-none-any.whl", hash = "sha256:d6cd4dbef3e1bb63bc16500fc5aa100f16e405bbff3fb4231711851be50c1560"},
- {file = "filelock-3.21.2.tar.gz", hash = "sha256:cfd218cfccf8b947fce7837da312ec3359d10ef2a47c8602edd59e0bacffb708"},
+ {file = "filelock-3.24.3-py3-none-any.whl", hash = "sha256:426e9a4660391f7f8a810d71b0555bce9008b0a1cc342ab1f6947d37639e002d"},
+ {file = "filelock-3.24.3.tar.gz", hash = "sha256:011a5644dc937c22699943ebbfc46e969cdde3e171470a6e40b9533e5a72affa"},
]
+[[package]]
+name = "ghp-import"
+version = "2.1.0"
+description = "Copy your docs directly to the gh-pages branch."
+optional = false
+python-versions = "*"
+groups = ["docs"]
+files = [
+ {file = "ghp-import-2.1.0.tar.gz", hash = "sha256:9c535c4c61193c2df8871222567d7fd7e5014d835f97dc7b7439069e2413d343"},
+ {file = "ghp_import-2.1.0-py3-none-any.whl", hash = "sha256:8337dd7b50877f163d4c0289bc1f1c7f127550241988d568c1db512c4324a619"},
+]
+
+[package.dependencies]
+python-dateutil = ">=2.8.1"
+
+[package.extras]
+dev = ["flake8", "markdown", "twine", "wheel"]
+
+[[package]]
+name = "griffelib"
+version = "2.0.0"
+description = "Signatures for entire Python programs. Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API."
+optional = false
+python-versions = ">=3.10"
+groups = ["docs"]
+files = [
+ {file = "griffelib-2.0.0-py3-none-any.whl", hash = "sha256:01284878c966508b6d6f1dbff9b6fa607bc062d8261c5c7253cb285b06422a7f"},
+]
+
+[package.extras]
+pypi = ["pip (>=24.0)", "platformdirs (>=4.2)", "wheel (>=0.42)"]
+
[[package]]
name = "identify"
version = "2.6.16"
@@ -1318,7 +1365,7 @@ version = "3.1.6"
description = "A very fast and expressive template engine."
optional = false
python-versions = ">=3.7"
-groups = ["main", "dev", "test"]
+groups = ["main", "dev", "docs", "test"]
files = [
{file = "jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67"},
{file = "jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d"},
@@ -1505,13 +1552,29 @@ html5 = ["html5lib"]
htmlsoup = ["BeautifulSoup4"]
source = ["Cython (==0.29.37)"]
+[[package]]
+name = "markdown"
+version = "3.10.2"
+description = "Python implementation of John Gruber's Markdown."
+optional = false
+python-versions = ">=3.10"
+groups = ["docs"]
+files = [
+ {file = "markdown-3.10.2-py3-none-any.whl", hash = "sha256:e91464b71ae3ee7afd3017d9f358ef0baf158fd9a298db92f1d4761133824c36"},
+ {file = "markdown-3.10.2.tar.gz", hash = "sha256:994d51325d25ad8aa7ce4ebaec003febcce822c3f8c911e3b17c52f7f589f950"},
+]
+
+[package.extras]
+docs = ["mdx_gh_links (>=0.2)", "mkdocs (>=1.6)", "mkdocs-gen-files", "mkdocs-literate-nav", "mkdocs-nature (>=0.6)", "mkdocs-section-index", "mkdocstrings[python] (>=0.28.3)"]
+testing = ["coverage", "pyyaml"]
+
[[package]]
name = "markupsafe"
version = "3.0.3"
description = "Safely add untrusted strings to HTML/XML markup."
optional = false
python-versions = ">=3.9"
-groups = ["main", "dev", "test"]
+groups = ["main", "dev", "docs", "test"]
files = [
{file = "markupsafe-3.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2f981d352f04553a7171b8e44369f2af4055f888dfb147d55e42d29e29e74559"},
{file = "markupsafe-3.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e1c1493fb6e50ab01d20a22826e57520f1284df32f2d8601fdd90b6304601419"},
@@ -1616,6 +1679,127 @@ files = [
{file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"},
]
+[[package]]
+name = "mergedeep"
+version = "1.3.4"
+description = "A deep merge function for 🐍."
+optional = false
+python-versions = ">=3.6"
+groups = ["docs"]
+files = [
+ {file = "mergedeep-1.3.4-py3-none-any.whl", hash = "sha256:70775750742b25c0d8f36c55aed03d24c3384d17c951b3175d898bd778ef0307"},
+ {file = "mergedeep-1.3.4.tar.gz", hash = "sha256:0096d52e9dad9939c3d975a774666af186eda617e6ca84df4c94dec30004f2a8"},
+]
+
+[[package]]
+name = "mkdocs"
+version = "1.6.1"
+description = "Project documentation with Markdown."
+optional = false
+python-versions = ">=3.8"
+groups = ["docs"]
+files = [
+ {file = "mkdocs-1.6.1-py3-none-any.whl", hash = "sha256:db91759624d1647f3f34aa0c3f327dd2601beae39a366d6e064c03468d35c20e"},
+ {file = "mkdocs-1.6.1.tar.gz", hash = "sha256:7b432f01d928c084353ab39c57282f29f92136665bdd6abf7c1ec8d822ef86f2"},
+]
+
+[package.dependencies]
+click = ">=7.0"
+colorama = {version = ">=0.4", markers = "platform_system == \"Windows\""}
+ghp-import = ">=1.0"
+jinja2 = ">=2.11.1"
+markdown = ">=3.3.6"
+markupsafe = ">=2.0.1"
+mergedeep = ">=1.3.4"
+mkdocs-get-deps = ">=0.2.0"
+packaging = ">=20.5"
+pathspec = ">=0.11.1"
+pyyaml = ">=5.1"
+pyyaml-env-tag = ">=0.1"
+watchdog = ">=2.0"
+
+[package.extras]
+i18n = ["babel (>=2.9.0)"]
+min-versions = ["babel (==2.9.0)", "click (==7.0)", "colorama (==0.4) ; platform_system == \"Windows\"", "ghp-import (==1.0)", "importlib-metadata (==4.4) ; python_version < \"3.10\"", "jinja2 (==2.11.1)", "markdown (==3.3.6)", "markupsafe (==2.0.1)", "mergedeep (==1.3.4)", "mkdocs-get-deps (==0.2.0)", "packaging (==20.5)", "pathspec (==0.11.1)", "pyyaml (==5.1)", "pyyaml-env-tag (==0.1)", "watchdog (==2.0)"]
+
+[[package]]
+name = "mkdocs-autorefs"
+version = "1.4.4"
+description = "Automatically link across pages in MkDocs."
+optional = false
+python-versions = ">=3.9"
+groups = ["docs"]
+files = [
+ {file = "mkdocs_autorefs-1.4.4-py3-none-any.whl", hash = "sha256:834ef5408d827071ad1bc69e0f39704fa34c7fc05bc8e1c72b227dfdc5c76089"},
+ {file = "mkdocs_autorefs-1.4.4.tar.gz", hash = "sha256:d54a284f27a7346b9c38f1f852177940c222da508e66edc816a0fa55fc6da197"},
+]
+
+[package.dependencies]
+Markdown = ">=3.3"
+markupsafe = ">=2.0.1"
+mkdocs = ">=1.1"
+
+[[package]]
+name = "mkdocs-get-deps"
+version = "0.2.0"
+description = "MkDocs extension that lists all dependencies according to a mkdocs.yml file"
+optional = false
+python-versions = ">=3.8"
+groups = ["docs"]
+files = [
+ {file = "mkdocs_get_deps-0.2.0-py3-none-any.whl", hash = "sha256:2bf11d0b133e77a0dd036abeeb06dec8775e46efa526dc70667d8863eefc6134"},
+ {file = "mkdocs_get_deps-0.2.0.tar.gz", hash = "sha256:162b3d129c7fad9b19abfdcb9c1458a651628e4b1dea628ac68790fb3061c60c"},
+]
+
+[package.dependencies]
+mergedeep = ">=1.3.4"
+platformdirs = ">=2.2.0"
+pyyaml = ">=5.1"
+
+[[package]]
+name = "mkdocstrings"
+version = "1.0.3"
+description = "Automatic documentation from sources, for MkDocs."
+optional = false
+python-versions = ">=3.10"
+groups = ["docs"]
+files = [
+ {file = "mkdocstrings-1.0.3-py3-none-any.whl", hash = "sha256:0d66d18430c2201dc7fe85134277382baaa15e6b30979f3f3bdbabd6dbdb6046"},
+ {file = "mkdocstrings-1.0.3.tar.gz", hash = "sha256:ab670f55040722b49bb45865b2e93b824450fb4aef638b00d7acb493a9020434"},
+]
+
+[package.dependencies]
+Jinja2 = ">=3.1"
+Markdown = ">=3.6"
+MarkupSafe = ">=1.1"
+mkdocs = ">=1.6"
+mkdocs-autorefs = ">=1.4"
+mkdocstrings-python = {version = ">=1.16.2", optional = true, markers = "extra == \"python\""}
+pymdown-extensions = ">=6.3"
+
+[package.extras]
+crystal = ["mkdocstrings-crystal (>=0.3.4)"]
+python = ["mkdocstrings-python (>=1.16.2)"]
+python-legacy = ["mkdocstrings-python-legacy (>=0.2.1)"]
+
+[[package]]
+name = "mkdocstrings-python"
+version = "2.0.3"
+description = "A Python handler for mkdocstrings."
+optional = false
+python-versions = ">=3.10"
+groups = ["docs"]
+files = [
+ {file = "mkdocstrings_python-2.0.3-py3-none-any.whl", hash = "sha256:0b83513478bdfd803ff05aa43e9b1fca9dd22bcd9471f09ca6257f009bc5ee12"},
+ {file = "mkdocstrings_python-2.0.3.tar.gz", hash = "sha256:c518632751cc869439b31c9d3177678ad2bfa5c21b79b863956ad68fc92c13b8"},
+]
+
+[package.dependencies]
+griffelib = ">=2.0"
+mkdocs-autorefs = ">=1.4"
+mkdocstrings = ">=0.30"
+typing-extensions = {version = ">=4.0", markers = "python_version < \"3.11\""}
+
[[package]]
name = "moto"
version = "4.0.13"
@@ -1994,7 +2178,7 @@ version = "26.0"
description = "Core utilities for Python packages"
optional = false
python-versions = ">=3.8"
-groups = ["dev", "lint", "test"]
+groups = ["dev", "docs", "lint", "test"]
files = [
{file = "packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529"},
{file = "packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4"},
@@ -2113,14 +2297,14 @@ files = [
[[package]]
name = "parse"
-version = "1.21.0"
+version = "1.21.1"
description = "parse() is the opposite of format()"
optional = false
python-versions = "*"
groups = ["dev", "test"]
files = [
- {file = "parse-1.21.0-py2.py3-none-any.whl", hash = "sha256:6d81f7bae0ab25fd72818375c4a9c71c8705256bfc42e8725be609cf8b904aed"},
- {file = "parse-1.21.0.tar.gz", hash = "sha256:937725d51330ffec9c7a26fdb5623baa135d8ba8ed78817ea9523538844e3ce4"},
+ {file = "parse-1.21.1-py2.py3-none-any.whl", hash = "sha256:55339ca698019815df3b8e8b550e5933933527e623b0cdf1ca2f404da35ffb47"},
+ {file = "parse-1.21.1.tar.gz", hash = "sha256:825e1a88e9d9fb481b8d2ca709c6195558b6eaa97c559ad3a9a20aa2d12815a3"},
]
[[package]]
@@ -2150,7 +2334,7 @@ version = "1.0.4"
description = "Utility library for gitignore style pattern matching of file paths."
optional = false
python-versions = ">=3.9"
-groups = ["dev", "lint"]
+groups = ["dev", "docs", "lint"]
files = [
{file = "pathspec-1.0.4-py3-none-any.whl", hash = "sha256:fb6ae2fd4e7c921a165808a552060e722767cfa526f99ca5156ed2ce45a5c723"},
{file = "pathspec-1.0.4.tar.gz", hash = "sha256:0210e2ae8a21a9137c0d470578cb0e595af87edaa6ebf12ff176f14a02e0e645"},
@@ -2164,14 +2348,14 @@ tests = ["pytest (>=9)", "typing-extensions (>=4.15)"]
[[package]]
name = "platformdirs"
-version = "4.7.0"
+version = "4.9.2"
description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`."
optional = false
python-versions = ">=3.10"
-groups = ["dev", "lint"]
+groups = ["dev", "docs", "lint"]
files = [
- {file = "platformdirs-4.7.0-py3-none-any.whl", hash = "sha256:1ed8db354e344c5bb6039cd727f096af975194b508e37177719d562b2b540ee6"},
- {file = "platformdirs-4.7.0.tar.gz", hash = "sha256:fd1a5f8599c85d49b9ac7d6e450bc2f1aaf4a23f1fe86d09952fe20ad365cf36"},
+ {file = "platformdirs-4.9.2-py3-none-any.whl", hash = "sha256:9170634f126f8efdae22fb58ae8a0eaa86f38365bc57897a6c4f781d1f5875bd"},
+ {file = "platformdirs-4.9.2.tar.gz", hash = "sha256:9a33809944b9db043ad67ca0db94b14bf452cc6aeaac46a88ea55b26e2e9d291"},
]
[[package]]
@@ -2400,7 +2584,7 @@ version = "2.19.2"
description = "Pygments is a syntax highlighting package written in Python."
optional = false
python-versions = ">=3.8"
-groups = ["dev", "test"]
+groups = ["dev", "docs", "test"]
files = [
{file = "pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b"},
{file = "pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887"},
@@ -2438,6 +2622,25 @@ tomlkit = ">=0.10.1"
spelling = ["pyenchant (>=3.2,<4.0)"]
testutils = ["gitpython (>3)"]
+[[package]]
+name = "pymdown-extensions"
+version = "10.21"
+description = "Extension pack for Python Markdown."
+optional = false
+python-versions = ">=3.9"
+groups = ["docs"]
+files = [
+ {file = "pymdown_extensions-10.21-py3-none-any.whl", hash = "sha256:91b879f9f864d49794c2d9534372b10150e6141096c3908a455e45ca72ad9d3f"},
+ {file = "pymdown_extensions-10.21.tar.gz", hash = "sha256:39f4a020f40773f6b2ff31d2cd2546c2c04d0a6498c31d9c688d2be07e1767d5"},
+]
+
+[package.dependencies]
+markdown = ">=3.6"
+pyyaml = "*"
+
+[package.extras]
+extra = ["pygments (>=2.19.1)"]
+
[[package]]
name = "pyspark"
version = "3.4.4"
@@ -2504,7 +2707,7 @@ version = "2.9.0.post0"
description = "Extensions to the standard Python datetime module"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7"
-groups = ["main", "dev", "test"]
+groups = ["main", "dev", "docs", "test"]
files = [
{file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"},
{file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"},
@@ -2531,15 +2734,8 @@ version = "6.0.3"
description = "YAML parser and emitter for Python"
optional = false
python-versions = ">=3.8"
-groups = ["dev", "test"]
+groups = ["dev", "docs", "test"]
files = [
- {file = "PyYAML-6.0.3-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:c2514fceb77bc5e7a2f7adfaa1feb2fb311607c9cb518dbc378688ec73d8292f"},
- {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c57bb8c96f6d1808c030b1687b9b5fb476abaa47f0db9c0101f5e9f394e97f4"},
- {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:efd7b85f94a6f21e4932043973a7ba2613b059c4a000551892ac9f1d11f5baf3"},
- {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:22ba7cfcad58ef3ecddc7ed1db3409af68d023b7f940da23c6c2a1890976eda6"},
- {file = "PyYAML-6.0.3-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:6344df0d5755a2c9a276d4473ae6b90647e216ab4757f8426893b5dd2ac3f369"},
- {file = "PyYAML-6.0.3-cp38-cp38-win32.whl", hash = "sha256:3ff07ec89bae51176c0549bc4c63aa6202991da2d9a6129d7aef7f1407d3f295"},
- {file = "PyYAML-6.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:5cf4e27da7e3fbed4d6c3d8e797387aaad68102272f8f9752883bc32d61cb87b"},
{file = "pyyaml-6.0.3-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:214ed4befebe12df36bcc8bc2b64b396ca31be9304b8f59e25c11cf94a4c033b"},
{file = "pyyaml-6.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02ea2dfa234451bbb8772601d7b8e426c2bfa197136796224e50e35a78777956"},
{file = "pyyaml-6.0.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b30236e45cf30d2b8e7b3e85881719e98507abed1011bf463a8fa23e9c3e98a8"},
@@ -2608,6 +2804,21 @@ files = [
{file = "pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f"},
]
+[[package]]
+name = "pyyaml-env-tag"
+version = "1.1"
+description = "A custom YAML tag for referencing environment variables in YAML files."
+optional = false
+python-versions = ">=3.9"
+groups = ["docs"]
+files = [
+ {file = "pyyaml_env_tag-1.1-py3-none-any.whl", hash = "sha256:17109e1a528561e32f026364712fee1264bc2ea6715120891174ed1b980d2e04"},
+ {file = "pyyaml_env_tag-1.1.tar.gz", hash = "sha256:2eb38b75a2d21ee0475d6d97ec19c63287a7e140231e4214969d0eac923cd7ff"},
+]
+
+[package.dependencies]
+pyyaml = "*"
+
[[package]]
name = "questionary"
version = "2.1.1"
@@ -2647,14 +2858,14 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
[[package]]
name = "responses"
-version = "0.25.8"
+version = "0.26.0"
description = "A utility library for mocking out the `requests` Python library."
optional = false
python-versions = ">=3.8"
groups = ["dev", "test"]
files = [
- {file = "responses-0.25.8-py3-none-any.whl", hash = "sha256:0c710af92def29c8352ceadff0c3fe340ace27cf5af1bbe46fb71275bcd2831c"},
- {file = "responses-0.25.8.tar.gz", hash = "sha256:9374d047a575c8f781b94454db5cab590b6029505f488d12899ddb10a4af1cf4"},
+ {file = "responses-0.26.0-py3-none-any.whl", hash = "sha256:03ec4409088cd5c66b71ecbbbd27fe2c58ddfad801c66203457b3e6a04868c37"},
+ {file = "responses-0.26.0.tar.gz", hash = "sha256:c7f6923e6343ef3682816ba421c006626777893cb0d5e1434f674b649bac9eb4"},
]
[package.dependencies]
@@ -2689,7 +2900,7 @@ version = "1.17.0"
description = "Python 2 and 3 compatibility utilities"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7"
-groups = ["main", "dev", "test"]
+groups = ["main", "dev", "docs", "test"]
files = [
{file = "six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274"},
{file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"},
@@ -2716,7 +2927,7 @@ version = "2.4.0"
description = "A lil' TOML parser"
optional = false
python-versions = ">=3.8"
-groups = ["dev", "lint", "test"]
+groups = ["dev", "docs", "lint", "test"]
markers = "python_version == \"3.10\""
files = [
{file = "tomli-2.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b5ef256a3fd497d4973c11bf142e9ed78b150d36f5773f1ca6088c230ffc5867"},
@@ -2897,12 +3108,12 @@ version = "4.15.0"
description = "Backported and Experimental Type Hints for Python 3.9+"
optional = false
python-versions = ">=3.9"
-groups = ["main", "dev", "lint", "test"]
+groups = ["main", "dev", "docs", "lint", "test"]
files = [
{file = "typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548"},
{file = "typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466"},
]
-markers = {test = "python_version == \"3.10\""}
+markers = {docs = "python_version == \"3.10\"", test = "python_version == \"3.10\""}
[[package]]
name = "tzdata"
@@ -2936,25 +3147,68 @@ zstd = ["backports-zstd (>=1.0.0) ; python_version < \"3.14\""]
[[package]]
name = "virtualenv"
-version = "20.36.1"
+version = "20.38.0"
description = "Virtual Python Environment builder"
optional = false
python-versions = ">=3.8"
groups = ["dev"]
files = [
- {file = "virtualenv-20.36.1-py3-none-any.whl", hash = "sha256:575a8d6b124ef88f6f51d56d656132389f961062a9177016a50e4f507bbcc19f"},
- {file = "virtualenv-20.36.1.tar.gz", hash = "sha256:8befb5c81842c641f8ee658481e42641c68b5eab3521d8e092d18320902466ba"},
+ {file = "virtualenv-20.38.0-py3-none-any.whl", hash = "sha256:d6e78e5889de3a4742df2d3d44e779366325a90cf356f15621fddace82431794"},
+ {file = "virtualenv-20.38.0.tar.gz", hash = "sha256:94f39b1abaea5185bf7ea5a46702b56f1d0c9aa2f41a6c2b8b0af4ddc74c10a7"},
]
[package.dependencies]
distlib = ">=0.3.7,<1"
-filelock = {version = ">=3.20.1,<4", markers = "python_version >= \"3.10\""}
+filelock = {version = ">=3.24.2,<4", markers = "python_version >= \"3.10\""}
platformdirs = ">=3.9.1,<5"
typing-extensions = {version = ">=4.13.2", markers = "python_version < \"3.11\""}
[package.extras]
-docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"]
-test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8) ; platform_python_implementation == \"PyPy\" or platform_python_implementation == \"GraalVM\" or platform_python_implementation == \"CPython\" and sys_platform == \"win32\" and python_version >= \"3.13\"", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10) ; platform_python_implementation == \"CPython\""]
+docs = ["furo (>=2023.7.26)", "pre-commit-uv (>=4.1.4)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinx-autodoc-typehints (>=3.6.2)", "sphinx-copybutton (>=0.5.2)", "sphinx-inline-tabs (>=2025.12.21.14)", "sphinxcontrib-mermaid (>=2)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"]
+test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8) ; platform_python_implementation == \"PyPy\" or platform_python_implementation == \"GraalVM\" or platform_python_implementation == \"CPython\" and sys_platform == \"win32\" and python_version >= \"3.13\"", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "pytest-xdist (>=3.5)", "setuptools (>=68)", "time-machine (>=2.10) ; platform_python_implementation == \"CPython\""]
+
+[[package]]
+name = "watchdog"
+version = "6.0.0"
+description = "Filesystem events monitoring"
+optional = false
+python-versions = ">=3.9"
+groups = ["docs"]
+files = [
+ {file = "watchdog-6.0.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d1cdb490583ebd691c012b3d6dae011000fe42edb7a82ece80965b42abd61f26"},
+ {file = "watchdog-6.0.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bc64ab3bdb6a04d69d4023b29422170b74681784ffb9463ed4870cf2f3e66112"},
+ {file = "watchdog-6.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c897ac1b55c5a1461e16dae288d22bb2e412ba9807df8397a635d88f671d36c3"},
+ {file = "watchdog-6.0.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6eb11feb5a0d452ee41f824e271ca311a09e250441c262ca2fd7ebcf2461a06c"},
+ {file = "watchdog-6.0.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ef810fbf7b781a5a593894e4f439773830bdecb885e6880d957d5b9382a960d2"},
+ {file = "watchdog-6.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:afd0fe1b2270917c5e23c2a65ce50c2a4abb63daafb0d419fde368e272a76b7c"},
+ {file = "watchdog-6.0.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:bdd4e6f14b8b18c334febb9c4425a878a2ac20efd1e0b231978e7b150f92a948"},
+ {file = "watchdog-6.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c7c15dda13c4eb00d6fb6fc508b3c0ed88b9d5d374056b239c4ad1611125c860"},
+ {file = "watchdog-6.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6f10cb2d5902447c7d0da897e2c6768bca89174d0c6e1e30abec5421af97a5b0"},
+ {file = "watchdog-6.0.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:490ab2ef84f11129844c23fb14ecf30ef3d8a6abafd3754a6f75ca1e6654136c"},
+ {file = "watchdog-6.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:76aae96b00ae814b181bb25b1b98076d5fc84e8a53cd8885a318b42b6d3a5134"},
+ {file = "watchdog-6.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a175f755fc2279e0b7312c0035d52e27211a5bc39719dd529625b1930917345b"},
+ {file = "watchdog-6.0.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e6f0e77c9417e7cd62af82529b10563db3423625c5fce018430b249bf977f9e8"},
+ {file = "watchdog-6.0.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:90c8e78f3b94014f7aaae121e6b909674df5b46ec24d6bebc45c44c56729af2a"},
+ {file = "watchdog-6.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e7631a77ffb1f7d2eefa4445ebbee491c720a5661ddf6df3498ebecae5ed375c"},
+ {file = "watchdog-6.0.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:c7ac31a19f4545dd92fc25d200694098f42c9a8e391bc00bdd362c5736dbf881"},
+ {file = "watchdog-6.0.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:9513f27a1a582d9808cf21a07dae516f0fab1cf2d7683a742c498b93eedabb11"},
+ {file = "watchdog-6.0.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:7a0e56874cfbc4b9b05c60c8a1926fedf56324bb08cfbc188969777940aef3aa"},
+ {file = "watchdog-6.0.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:e6439e374fc012255b4ec786ae3c4bc838cd7309a540e5fe0952d03687d8804e"},
+ {file = "watchdog-6.0.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7607498efa04a3542ae3e05e64da8202e58159aa1fa4acddf7678d34a35d4f13"},
+ {file = "watchdog-6.0.0-py3-none-manylinux2014_armv7l.whl", hash = "sha256:9041567ee8953024c83343288ccc458fd0a2d811d6a0fd68c4c22609e3490379"},
+ {file = "watchdog-6.0.0-py3-none-manylinux2014_i686.whl", hash = "sha256:82dc3e3143c7e38ec49d61af98d6558288c415eac98486a5c581726e0737c00e"},
+ {file = "watchdog-6.0.0-py3-none-manylinux2014_ppc64.whl", hash = "sha256:212ac9b8bf1161dc91bd09c048048a95ca3a4c4f5e5d4a7d1b1a7d5752a7f96f"},
+ {file = "watchdog-6.0.0-py3-none-manylinux2014_ppc64le.whl", hash = "sha256:e3df4cbb9a450c6d49318f6d14f4bbc80d763fa587ba46ec86f99f9e6876bb26"},
+ {file = "watchdog-6.0.0-py3-none-manylinux2014_s390x.whl", hash = "sha256:2cce7cfc2008eb51feb6aab51251fd79b85d9894e98ba847408f662b3395ca3c"},
+ {file = "watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl", hash = "sha256:20ffe5b202af80ab4266dcd3e91aae72bf2da48c0d33bdb15c66658e685e94e2"},
+ {file = "watchdog-6.0.0-py3-none-win32.whl", hash = "sha256:07df1fdd701c5d4c8e55ef6cf55b8f0120fe1aef7ef39a1c6fc6bc2e606d517a"},
+ {file = "watchdog-6.0.0-py3-none-win_amd64.whl", hash = "sha256:cbafb470cf848d93b5d013e2ecb245d4aa1c8fd0504e863ccefa32445359d680"},
+ {file = "watchdog-6.0.0-py3-none-win_ia64.whl", hash = "sha256:a1914259fa9e1454315171103c6a30961236f508b9b623eae470268bbcc6a22f"},
+ {file = "watchdog-6.0.0.tar.gz", hash = "sha256:9ddf7c82fda3ae8e24decda1338ede66e1c99883db93711d8fb941eaa2d8c282"},
+]
+
+[package.extras]
+watchmedo = ["PyYAML (>=3.10)"]
[[package]]
name = "wcwidth"
@@ -3091,19 +3345,51 @@ files = [
[[package]]
name = "xmltodict"
-version = "1.0.2"
+version = "1.0.4"
description = "Makes working with XML feel like you are working with JSON"
optional = false
python-versions = ">=3.9"
groups = ["dev", "test"]
files = [
- {file = "xmltodict-1.0.2-py3-none-any.whl", hash = "sha256:62d0fddb0dcbc9f642745d8bbf4d81fd17d6dfaec5a15b5c1876300aad92af0d"},
- {file = "xmltodict-1.0.2.tar.gz", hash = "sha256:54306780b7c2175a3967cad1db92f218207e5bc1aba697d887807c0fb68b7649"},
+ {file = "xmltodict-1.0.4-py3-none-any.whl", hash = "sha256:a4a00d300b0e1c59fc2bfccb53d7b2e88c32f200df138a0dd2229f842497026a"},
+ {file = "xmltodict-1.0.4.tar.gz", hash = "sha256:6d94c9f834dd9e44514162799d344d815a3a4faec913717a9ecbfa5be1bb8e61"},
]
[package.extras]
test = ["pytest", "pytest-cov"]
+[[package]]
+name = "zensical"
+version = "0.0.23"
+description = "A modern static site generator built by the creators of Material for MkDocs"
+optional = false
+python-versions = ">=3.10"
+groups = ["docs"]
+files = [
+ {file = "zensical-0.0.23-cp310-abi3-macosx_10_12_x86_64.whl", hash = "sha256:35d6d3eb803fe73a67187a1a25443408bd02a8dd50e151f4a4bafd40de3f0928"},
+ {file = "zensical-0.0.23-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:5973267460a190f348f24d445ff0c01e8ed334fd075947687b305e68257f6b18"},
+ {file = "zensical-0.0.23-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:953adf1f0b346a6c65fc6e05e6cc1c38a6440fec29c50c76fb29700cc1927006"},
+ {file = "zensical-0.0.23-cp310-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:49c1cbd6131dafa056be828e081759184f9b8dd24b99bf38d1e77c8c31b0c720"},
+ {file = "zensical-0.0.23-cp310-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f5b7fe22c5d33b2b91899c5df7631ad4ce9cccfabac2560cc92ba73eafe2d297"},
+ {file = "zensical-0.0.23-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a3679d6bf6374f503afb74d9f6061da5de83c25922f618042b63a30b16f0389"},
+ {file = "zensical-0.0.23-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:54d981e21a19c3dcec6e7fa77c4421db47389dfdff20d29fea70df8e1be4062e"},
+ {file = "zensical-0.0.23-cp310-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:afde7865cc3c79c99f6df4a911d638fb2c3b472a1b81367d47163f8e3c36f910"},
+ {file = "zensical-0.0.23-cp310-abi3-musllinux_1_2_i686.whl", hash = "sha256:c484674d7b0a3e6d39db83914db932249bccdef2efaf8a5669671c66c16f584d"},
+ {file = "zensical-0.0.23-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:927d12fe2851f355fb3206809e04641d6651bdd2ff4afe9c205721aa3a32aa82"},
+ {file = "zensical-0.0.23-cp310-abi3-win32.whl", hash = "sha256:ffb79db4244324e9cc063d16adff25a40b145153e5e76d75e0012ba3c05af25d"},
+ {file = "zensical-0.0.23-cp310-abi3-win_amd64.whl", hash = "sha256:a8cfe240dca75231e8e525985366d010d09ee73aec0937930e88f7230694ce01"},
+ {file = "zensical-0.0.23.tar.gz", hash = "sha256:5c4fc3aaf075df99d8cf41b9f2566e4d588180d9a89493014d3607dfe50ac4bc"},
+]
+
+[package.dependencies]
+click = ">=8.1.8"
+deepmerge = ">=2.0"
+markdown = ">=3.7"
+pygments = ">=2.16"
+pymdown-extensions = ">=10.15"
+pyyaml = ">=6.0.2"
+tomli = {version = ">=2.0", markers = "python_full_version < \"3.11.0\""}
+
[[package]]
name = "zipp"
version = "3.23.0"
@@ -3127,4 +3413,4 @@ type = ["pytest-mypy"]
[metadata]
lock-version = "2.1"
python-versions = ">=3.10,<3.12"
-content-hash = "0b0b9c546709542f314418c15b0c6151803a006891b74808fd80b1f86ff28d94"
+content-hash = "d83c9a1871a248d0bbad6811b26e87816863cb95600b8cd9ebc2bed33cd854d9"
diff --git a/pyproject.toml b/pyproject.toml
index e338b23..7a0cda6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -78,6 +78,15 @@ types-setuptools = "68.2.0.0"
types-urllib3 = "1.26.25.14"
types-xmltodict = "0.13.0.3"
+[tool.poetry.group.docs]
+optional = true
+
+[tool.poetry.group.docs.dependencies]
+click = "8.2.1"
+mkdocs = "^1.6.1"
+mkdocstrings = { version = "^1.0.3", extras = ["python"] }
+zensical = "~=0.0.23"
+
[tool.ruff]
line-length = 100
diff --git a/src/dve/core_engine/backends/implementations/duckdb/readers/csv.py b/src/dve/core_engine/backends/implementations/duckdb/readers/csv.py
index 2018010..98b8cb8 100644
--- a/src/dve/core_engine/backends/implementations/duckdb/readers/csv.py
+++ b/src/dve/core_engine/backends/implementations/duckdb/readers/csv.py
@@ -202,6 +202,7 @@ class DuckDBCSVRepeatingHeaderReader(PolarsToDuckDBCSVReader):
`NonDistinctHeaderError`.
So using the example above, the expected entity would look like this...
+
| headerCol1 | headerCol2 | headerCol3 |
| ---------- | ---------- | ---------- |
| shop1 | clothes | 2025-01-01 |
diff --git a/zensical.toml b/zensical.toml
new file mode 100644
index 0000000..255a771
--- /dev/null
+++ b/zensical.toml
@@ -0,0 +1,214 @@
+[project]
+site_name = "Data Validation Engine"
+site_description = "Documentation for using the Data Validation Engine (DVE)."
+site_author = "NHS England"
+site_url = "https://nhsdigital.github.io/data-validation-engine/"
+copyright = """
+
+"""
+nav = [
+ {"User Guidance" = [
+ "index.md",
+ {"Installation" = "user_guidance/install.md"},
+ {"Getting Started" = "user_guidance/getting_started.md"},
+ {"Auditing" = "user_guidance/auditing.md"},
+ {"Creating a Dischema" = [
+ {"File Transformation" = "user_guidance/file_transformation.md"},
+ {"Data Contract" = "user_guidance/data_contract.md"},
+ {"Business Rules" = "user_guidance/business_rules.md"},
+ ]},
+ {"Backend Implementations" = [
+ {"DuckDB" = "user_guidance/implementations/duckdb.md"},
+ {"Spark" = "user_guidance/implementations/spark.md"},
+ {"Platform Specific Implementations" = [
+ {"Databricks" = "user_guidance/implementations/platform_specific/databricks.md"},
+ {"Palantir Foundry" = "user_guidance/implementations/platform_specific/palantir_foundry.md"},
+ ]},
+ ]},
+ {"Reporting" = [
+ {"Feedback Messages" = "user_guidance/feedback_messages.md"},
+ {"Error Reports" = "user_guidance/error_reports.md"},
+ ]}
+ ]},
+ {"Advanced User Guidance" = [
+ "advanced_guidance/index.md",
+ {"DVE Package Documentation" = [
+ "advanced_guidance/package_documentation/index.md",
+ {"Pipeline" = "advanced_guidance/package_documentation/pipeline.md"},
+ {"Auditing" = "advanced_guidance/package_documentation/auditing.md"},
+ {"Data Contract" = [
+ {"Readers" = "advanced_guidance/package_documentation/readers.md"},
+ {"Domain Types" = "advanced_guidance/package_documentation/domain_types.md"},
+ ]},
+ {"Business Rules" = [
+ {"Rules" = [
+ {"Operations" = "advanced_guidance/package_documentation/operations.md"},
+ ]},
+ {"Refdata" = [
+ {"Refdata Types" = "advanced_guidance/package_documentation/refence_data_types.md"},
+ {"Refdata Loaders" = "advanced_guidance/package_documentation/refdata_loaders.md"},
+ ]}
+ ]},
+ {"Feedback" = [
+ {"Feedback Messages" = "advanced_guidance/package_documentation/feedback_messages.md"},
+ ]},
+ {"Models" = "advanced_guidance/package_documentation/models.md"},
+ ]},
+ {"DVE Developer Guidance" = [
+ {"Implementing a new backend" = "advanced_guidance/new_backend.md"},
+ {"Dischema Language Server" = "advanced_guidance/json_schemas.md"},
+ ]},
+ ]}
+]
+extra_css = ["assets/stylesheets/extra.css"]
+# extra_javascript = ["assets/javascript/extra.js"]
+repo_url = "https://github.com/NHSDigital/data-validation-engine"
+repo_name = "Data Validation Engine"
+
+# ----------------------------------------------------------------------------
+# Section for configuring theme options
+# ----------------------------------------------------------------------------
+[project.theme]
+variant = "classic"
+custom_dir = "overrides"
+logo = "assets/images/favicon.svg"
+favicon = "assets/images/favicon.ico"
+language = "en"
+features = [
+ "content.action.edit",
+ "content.code.annotate",
+ "content.code.copy",
+ "content.code.select",
+ # "content.footnote.tooltips",
+ "content.tabs.link",
+ # "content.tooltips",
+ # "header.autohide",
+ # "navigation.expand",
+ "navigation.footer",
+ "navigation.indexes",
+ "navigation.instant",
+ "navigation.instant.prefetch",
+ "navigation.instant.preview",
+ "navigation.instant.progress",
+ "navigation.path",
+ #"navigation.prune",
+ "navigation.sections",
+ "navigation.tabs",
+ #"navigation.tabs.sticky",
+ "navigation.top",
+ # "navigation.tracking",
+ "search.highlight",
+ "toc.follow",
+ "toc.integrate",
+]
+
+# ----------------------------------------------------------------------------
+# In the "palette" subsection you can configure options for the color scheme.
+# You can configure different color schemes, e.g., to turn on dark mode,
+# that the user can switch between. Each color scheme can be further
+# customized.
+#
+# Read more:
+# - https://zensical.org/docs/setup/colors/
+# ----------------------------------------------------------------------------
+[[project.theme.palette]]
+media = "(prefers-color-scheme)"
+toggle.icon = "material/brightness-auto"
+toggle.name = "Switch to light mode"
+
+[[project.theme.palette]]
+media = "(prefers-color-scheme: light)"
+scheme = "default"
+toggle.icon = "material/brightness-7"
+toggle.name = "Switch to dark mode"
+
+[[project.theme.palette]]
+media = "(prefers-color-scheme: dark)"
+scheme = "slate"
+toggle.icon = "material/brightness-4"
+toggle.name = "Switch to system preference"
+
+# ----------------------------------------------------------------------------
+# In the "font" subsection you can configure the fonts used. By default, fonts
+# are loaded from Google Fonts, giving you a wide range of choices from a set
+# of suitably licensed fonts. There are options for a normal text font and for
+# a monospaced font used in code blocks.
+# ----------------------------------------------------------------------------
+[project.theme.font]
+text = "Inter"
+code = "JetBrains Mono"
+
+# ----------------------------------------------------------------------------
+# The "extra" section contains miscellaneous settings.
+# ----------------------------------------------------------------------------
+
+[project.extra.consent]
+title = "Cookie consent"
+description = """
+ We use cookies to recognize your repeated visits and preferences, as well
+ as to measure the effectiveness of our documentation and whether users
+ find what they're searching for. With your consent, you're helping us to
+ make our documentation better.
+"""
+
+[[project.extra.social]]
+icon = "nhseng"
+link = "https://www.england.nhs.uk/"
+name = "NHS England Website"
+
+[[project.extra.social]]
+icon = "fontawesome/brands/github"
+link = "https://github.com/NHSDigital"
+name = "NHS Digital GitHub"
+
+# ----------------------------------------------------------------------------
+# Markdown Extensions
+# ----------------------------------------------------------------------------
+
+[project.markdown_extensions.abbr]
+[project.markdown_extensions.admonition]
+[project.markdown_extensions.attr_list]
+[project.markdown_extensions.md_in_html]
+[project.markdown_extensions.pymdownx.details]
+
+[project.markdown_extensions.pymdownx.emoji]
+emoji_index = "zensical.extensions.emoji.twemoji"
+emoji_generator = "zensical.extensions.emoji.to_svg"
+options.custom_icons = ["overrides/.icons"]
+
+[project.markdown_extensions.pymdownx.highlight]
+[project.markdown_extensions.pymdownx.inlinehilite]
+
+[project.markdown_extensions.pymdownx.snippets]
+auto_append = ["includes/jargon_and_acronyms.md"]
+
+[project.markdown_extensions.pymdownx.superfences]
+
+[project.markdown_extensions.pymdownx.tabbed]
+alternate_style = true
+
+[project.markdown_extensions.pymdownx.tabbed.slugify]
+object = "pymdownx.slugs.slugify"
+kwds = { case = "lower" }
+
+[project.markdown_extensions.toc]
+permalink = true
+
+[project.markdown_extensions.zensical.extensions.preview]
+
+# ----------------------------------------------------------------------------
+# Plugins
+# ----------------------------------------------------------------------------
+
+[project.plugins.mkdocstrings.handlers.python]
+paths = ["src/dve"]
+inventories = ["https://docs.python.org/3/objects.inv"]