From 0daaf792b346bee2675d2bd2e3f9f8cea4b7b453 Mon Sep 17 00:00:00 2001 From: ssjunnebo Date: Tue, 31 Mar 2026 10:11:32 +0200 Subject: [PATCH 01/10] Replace regexes with central version --- dataflow_transfer/run_classes/element_runs.py | 5 ++-- dataflow_transfer/run_classes/generic_runs.py | 22 ++++++++++++++ .../run_classes/illumina_runs.py | 12 ++------ dataflow_transfer/run_classes/ont_runs.py | 4 +-- dataflow_transfer/utils/statusdb.py | 30 ++++++++++++++++++- 5 files changed, 57 insertions(+), 16 deletions(-) diff --git a/dataflow_transfer/run_classes/element_runs.py b/dataflow_transfer/run_classes/element_runs.py index 2b543e2..aaa8a06 100644 --- a/dataflow_transfer/run_classes/element_runs.py +++ b/dataflow_transfer/run_classes/element_runs.py @@ -6,6 +6,8 @@ class ElementRun(Run): """Defines an Element sequencing run""" + run_family = "Element" + def __init__(self, run_dir, configuration): super().__init__(run_dir, configuration) self.final_file = "RunUploaded.json" @@ -18,9 +20,6 @@ class AVITIRun(ElementRun): run_type = "AVITI" def __init__(self, run_dir, configuration): - self.run_id_format = ( - r"^\d{8}_AV\d{6}_(A|B)\d{10}$" # 20251007_AV242106_A2507535225 - ) super().__init__(run_dir, configuration) self.flowcell_id = self.run_id.split("_")[-1][1:] # 2507535225 diff --git a/dataflow_transfer/run_classes/generic_runs.py b/dataflow_transfer/run_classes/generic_runs.py index 14e4576..61311af 100644 --- a/dataflow_transfer/run_classes/generic_runs.py +++ b/dataflow_transfer/run_classes/generic_runs.py @@ -12,6 +12,10 @@ class Run: """Defines a generic sequencing run""" + run_type = None + run_family = None + default_run_id_format = None + def __init__(self, run_dir, configuration): self.run_dir = run_dir self.run_id = os.path.basename(run_dir) @@ -33,6 +37,24 @@ def __init__(self, run_dir, configuration): ) self.remote_destination = self.sequencer_config.get("remote_destination") self.db = StatusdbSession(self.configuration.get("statusdb")) + self.run_id_format = self._resolve_run_id_format() + + def _resolve_run_id_format(self): + """Resolve the run ID regex from central config.""" + run_id_format = None + if self.run_family and self.run_type: + try: + run_id_format = self.db.get_regex_pattern( + self.run_family, self.run_type + ) + except Exception as exc: + logger.warning( + "Unable to load run_id_format for %s from regex config: %s", + self.run_type, + exc, + ) + + return run_id_format def confirm_run_type(self): """Compare run ID with expected format for the run type.""" diff --git a/dataflow_transfer/run_classes/illumina_runs.py b/dataflow_transfer/run_classes/illumina_runs.py index 12cfa11..bf3663f 100644 --- a/dataflow_transfer/run_classes/illumina_runs.py +++ b/dataflow_transfer/run_classes/illumina_runs.py @@ -6,6 +6,8 @@ class IlluminaRun(Run): """Defines an Illumina sequencing run""" + run_family = "Illumina" + def __init__(self, run_dir, configuration): super().__init__(run_dir, configuration) self.final_file = "CopyComplete.txt" @@ -19,9 +21,6 @@ class NovaSeqXPlusRun(IlluminaRun): run_type = "NovaSeqXPlus" def __init__(self, run_dir, configuration): - self.run_id_format = ( - r"^\d{8}_[A-Z0-9]+_\d{4}_[A-Z0-9]+$" # 20251010_LH00202_0284_B22CVHTLT1 - ) super().__init__(run_dir, configuration) self.flowcell_id = self.run_id.split("_")[-1][1:] # 22CVHTLT1 @@ -33,9 +32,6 @@ class NextSeqRun(IlluminaRun): run_type = "NextSeq" def __init__(self, run_dir, configuration): - self.run_id_format = ( - r"^\d{6}_[A-Z0-9]+_\d{3}_[A-Z0-9]+$" # 251015_VH00203_572_AAHFHCCM5 - ) super().__init__(run_dir, configuration) @@ -46,9 +42,6 @@ class MiSeqRun(IlluminaRun): run_type = "MiSeq" def __init__(self, run_dir, configuration): - self.run_id_format = ( - r"^\d{6}_[A-Z0-9]+_\d{4}_[A-Z0-9\-]+$" # 251015_M01548_0646_000000000-M6D7K - ) super().__init__(run_dir, configuration) @@ -59,6 +52,5 @@ class MiSeqi100Run(IlluminaRun): run_type = "MiSeqi100" def __init__(self, run_dir, configuration): - self.run_id_format = r"^\d{8}_[A-Z0-9]+_\d{4}_[A-Z0-9]{10}-SC3$" # 20260128_SH01140_0002_ASC2150561-SC3 super().__init__(run_dir, configuration) self.flowcell_id = self.run_id.split("_")[-1][1:] # SC2150561-SC3 diff --git a/dataflow_transfer/run_classes/ont_runs.py b/dataflow_transfer/run_classes/ont_runs.py index 243b685..a9c74c9 100644 --- a/dataflow_transfer/run_classes/ont_runs.py +++ b/dataflow_transfer/run_classes/ont_runs.py @@ -6,6 +6,8 @@ class ONTRun(Run): """Defines a ONT sequencing run""" + run_family = "ONT" + def __init__(self, run_dir, configuration): super().__init__(run_dir, configuration) self.final_file = "final_summary.txt" @@ -19,7 +21,6 @@ class PromethIONRun(ONTRun): run_type = "PromethION" def __init__(self, run_dir, configuration): - self.run_id_format = r"^\d{8}_\d{4}_[A-Z0-9]{2}_P[A-Z0-9]+_[a-f0-9]{8}$" # 20251015_1051_3B_PBG60686_0af3a2e0 super().__init__(run_dir, configuration) @@ -30,5 +31,4 @@ class MinIONRun(ONTRun): run_type = "MinION" def __init__(self, run_dir, configuration): - self.run_id_format = r"^\d{8}_\d{4}_MN[A-Z0-9]+_[A-Z0-9]+_[a-f0-9]{8}$" # 20240229_1404_MN19414_ASH657_7a74bf8f super().__init__(run_dir, configuration) diff --git a/dataflow_transfer/utils/statusdb.py b/dataflow_transfer/utils/statusdb.py index ecfc1a4..01b7d6e 100644 --- a/dataflow_transfer/utils/statusdb.py +++ b/dataflow_transfer/utils/statusdb.py @@ -57,7 +57,9 @@ def _retry_call(self, func): def get_db_doc(self, ddoc, view, run_id): """Retrieve a document from the database via retried call.""" - doc_id = self.get_doc_id(ddoc, view, run_id) + doc_id = self.get_doc_id( + ddoc, view, run_id + ) # TODO: refactor to use get_document if doc_id: return self._retry_call( lambda: self.connection.get_document( @@ -66,6 +68,32 @@ def get_db_doc(self, ddoc, view, run_id): ) return None + def get_document(self, db, doc_id): + """Retrieve a document from any database via retried call.""" + return self._retry_call( + lambda: self.connection.get_document(db=db, doc_id=doc_id).get_result() + ) + + def get_regex_pattern( + self, + run_family, + run_type, + regex_db="gs_configs", + regex_doc_id="regex_patterns", + ): + """Lookup the python regex pattern for a run type from the central regex config document.""" + regex_doc = self.get_document(db=regex_db, doc_id=regex_doc_id) + if not regex_doc: + return None + + flowcell_patterns = regex_doc.get("flowcell_patterns", {}) + family_patterns = flowcell_patterns.get(run_family, {}) + if not family_patterns: + return None + + pattern = family_patterns.get(run_type) + return pattern + def get_doc_id(self, ddoc, view, run_id): """Retrieve a document ID from the database via retried call.""" result = self._retry_call( From b1532684cd8e938426c36d80f7fc2e07de52c7f3 Mon Sep 17 00:00:00 2001 From: ssjunnebo Date: Tue, 31 Mar 2026 11:32:03 +0200 Subject: [PATCH 02/10] Fix tests --- dataflow_transfer/tests/test_run_classes.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/dataflow_transfer/tests/test_run_classes.py b/dataflow_transfer/tests/test_run_classes.py index 25a5d31..f7773b9 100644 --- a/dataflow_transfer/tests/test_run_classes.py +++ b/dataflow_transfer/tests/test_run_classes.py @@ -140,6 +140,26 @@ def __init__(self, config): def get_db_doc(self, ddoc, view, run_id): return None + def get_regex_pattern(self, run_family, run_type): + if run_family == "Illumina": + if run_type == "NovaSeqXPlus": + return r"^(?P\d{8})_(?P[A-Z0-9]+)_\d{4}_(?P(A|B))(?P[A-Z0-9]+)$" + elif run_type == "NextSeq": + return r"^(?P\d{6})_(?P[A-Z0-9]+)_\d{3}_(?P(A|B))(?P[A-Z0-9]+)$" + elif run_type == "MiSeq": + return r"^(?P\d{6})_(?P[A-Z0-9]+)_\d{4}_(?P[A-Z0-9\-]+)$" + elif run_type == "MiSeqi100": + return r"^(?P\d{8})_(?P[A-Z0-9]+)_\d{4}_A(?P[A-Z0-9]{9}-SC3)$" + elif run_family == "ONT": + if run_type == "PromethION": + return r"^(?P\d{8})_(?P