diff --git a/tools/agentic_import/pvmap_generator.py b/tools/agentic_import/pvmap_generator.py index e74a481262..4b3d457ae8 100644 --- a/tools/agentic_import/pvmap_generator.py +++ b/tools/agentic_import/pvmap_generator.py @@ -22,7 +22,7 @@ import subprocess import sys from datetime import datetime -from dataclasses import dataclass +from dataclasses import dataclass, field from pathlib import Path from typing import List, Optional @@ -44,6 +44,11 @@ def _define_flags(): flags.DEFINE_list('input_metadata', [], 'List of input metadata file paths (optional)') + flags.DEFINE_list( + 'extra_instruction_files', [], + 'List of extra instruction file paths to make available to Gemini ' + '(optional)') + flags.DEFINE_boolean( 'sdmx_dataset', False, 'Whether the dataset is in SDMX format (default: False)') @@ -86,6 +91,11 @@ def _define_flags(): flags.DEFINE_string( 'working_dir', None, 'Working directory for the generator (default: current directory)') + + flags.DEFINE_integer( + 'extra_instruction_max_bytes', 65536, + 'Maximum allowed size in bytes for each extra instruction file ' + '(default: 65536)') except flags.DuplicateFlagError: pass @@ -110,6 +120,8 @@ class Config: output_path: str = 'output/output' gemini_cli: Optional[str] = None working_dir: Optional[str] = None + extra_instruction_files: List[str] = field(default_factory=list) + extra_instruction_max_bytes: int = 65536 @dataclass @@ -137,6 +149,8 @@ def __init__(self, config: Config): # Copy config to avoid modifying the original self._config = copy.deepcopy(config) + self._validate_extra_instruction_max_bytes() + # Convert input_data paths to absolute if self._config.data_config.input_data: self._config.data_config.input_data = [ @@ -151,6 +165,13 @@ def __init__(self, config: Config): for path in self._config.data_config.input_metadata ] + # Resolve and validate runtime extra instruction files. + if self._config.extra_instruction_files: + self._config.extra_instruction_files = [ + self._validate_extra_instruction_file(path) + for path in self._config.extra_instruction_files + ] + # Parse output_path into absolute path, handling relative paths and ~ expansion output_path_raw = self._config.output_path if not output_path_raw or not output_path_raw.strip(): @@ -186,10 +207,7 @@ def __init__(self, config: Config): def _validate_and_convert_path(self, path: str) -> Path: """Convert path to absolute and validate it's within working directory.""" - p = Path(path).expanduser() - if not p.is_absolute(): - p = self._working_dir / p - real_path = p.resolve() + real_path = self._resolve_path(path) working_dir = self._working_dir.resolve() try: real_path.relative_to(working_dir) @@ -198,6 +216,43 @@ def _validate_and_convert_path(self, path: str) -> Path: f"Path '{path}' is outside working directory '{working_dir}'") return real_path + def _resolve_path(self, path: str) -> Path: + """Resolve a path against the working directory when needed.""" + p = Path(path).expanduser() + if not p.is_absolute(): + p = self._working_dir / p + return p.resolve() + + def _validate_extra_instruction_max_bytes(self) -> None: + """Validate the configured size limit for extra instruction files.""" + if self._config.extra_instruction_max_bytes < 0: + raise ValueError("extra_instruction_max_bytes must be non-negative") + + def _validate_extra_instruction_file(self, path: str) -> Path: + """Resolve and validate an extra instruction file.""" + resolved_path = self._resolve_path(path) + if not resolved_path.exists(): + raise ValueError( + f"Extra instruction file does not exist: {resolved_path}") + if not resolved_path.is_file(): + raise ValueError( + f"Extra instruction path is not a file: {resolved_path}") + + file_size = resolved_path.stat().st_size + max_bytes = self._config.extra_instruction_max_bytes + if file_size > max_bytes: + raise ValueError( + f"Extra instruction file is larger than {max_bytes} bytes: " + f"{resolved_path}") + + try: + resolved_path.read_text(encoding='utf-8') + except UnicodeDecodeError: + raise ValueError(f"Extra instruction file is not valid UTF-8 text: " + f"{resolved_path}") + + return resolved_path + def _initialize_datacommons_dir(self) -> Path: """Initialize and return the .datacommons directory path.""" dc_dir = self._working_dir / '.datacommons' @@ -419,7 +474,10 @@ def _generate_prompt(self) -> Path: 'output_basename': self._output_basename, # Base name for pvmap/metadata files 'run_dir_abs': - str(self._run_dir) + str(self._run_dir), + 'extra_instruction_files_abs': [ + str(path) for path in self._config.extra_instruction_files + ] if self._config.extra_instruction_files else [], } # Render template with these variables @@ -440,16 +498,19 @@ def prepare_config() -> Config: input_metadata=_FLAGS.input_metadata or [], is_sdmx_dataset=_FLAGS.sdmx_dataset) - return Config(data_config=data_config, - dry_run=_FLAGS.dry_run, - maps_api_key=_FLAGS.maps_api_key, - dc_api_key=_FLAGS.dc_api_key, - max_iterations=_FLAGS.max_iterations, - skip_confirmation=_FLAGS.skip_confirmation, - enable_sandboxing=_FLAGS.enable_sandboxing, - output_path=_FLAGS.output_path, - gemini_cli=_FLAGS.gemini_cli, - working_dir=_FLAGS.working_dir) + return Config( + data_config=data_config, + dry_run=_FLAGS.dry_run, + maps_api_key=_FLAGS.maps_api_key, + dc_api_key=_FLAGS.dc_api_key, + max_iterations=_FLAGS.max_iterations, + skip_confirmation=_FLAGS.skip_confirmation, + enable_sandboxing=_FLAGS.enable_sandboxing, + output_path=_FLAGS.output_path, + gemini_cli=_FLAGS.gemini_cli, + working_dir=_FLAGS.working_dir, + extra_instruction_files=_FLAGS.extra_instruction_files or [], + extra_instruction_max_bytes=_FLAGS.extra_instruction_max_bytes) def main(_): diff --git a/tools/agentic_import/pvmap_generator_test.py b/tools/agentic_import/pvmap_generator_test.py index 8109f6f009..379baec049 100644 --- a/tools/agentic_import/pvmap_generator_test.py +++ b/tools/agentic_import/pvmap_generator_test.py @@ -35,13 +35,22 @@ def setUp(self): self._data_file.write_text('header\nvalue') self._metadata_file = Path('metadata.csv') self._metadata_file.write_text('parameter,value') + self._extra_instruction_file = Path('extra_instructions.md') + self._extra_instruction_file.write_text( + 'Use provided place guidance before auto-resolution.', + encoding='utf-8') def tearDown(self): os.chdir( self._cwd) # Restore prior cwd so later tests see original state. self._temp_dir.cleanup() - def _make_generator(self, *, is_sdmx: bool) -> PVMapGenerator: + def _make_generator(self, + *, + is_sdmx: bool, + extra_instruction_files=None, + extra_instruction_max_bytes: int = 65536, + working_dir=None) -> PVMapGenerator: data_config = DataConfig( input_data=[str(self._data_file)], input_metadata=[str(self._metadata_file)], @@ -52,6 +61,9 @@ def _make_generator(self, *, is_sdmx: bool) -> PVMapGenerator: dry_run=True, max_iterations=3, output_path='output/output_file', + extra_instruction_files=extra_instruction_files or [], + extra_instruction_max_bytes=extra_instruction_max_bytes, + working_dir=working_dir, ) return PVMapGenerator(config) @@ -131,6 +143,7 @@ def _assert_prompt_content(self, prompt_path: Path, *, expect_sdmx: bool, # Working directory reference should match the temp execution root. expected_working_dir = str(Path(self._temp_dir.name).resolve()) self.assertIn(expected_working_dir, prompt_text) + self.assertIn('"extra_instruction_files": []', prompt_text) def test_generate_prompt_csv(self): generator = self._make_generator(is_sdmx=False) @@ -231,6 +244,74 @@ def test_generate_prompt_with_relative_working_dir(self): self.assertIn(str(data_file.resolve()), prompt_text) self.assertIn(str(metadata_file.resolve()), prompt_text) + def test_generate_prompt_with_relative_extra_instruction_file(self): + generator = self._make_generator( + is_sdmx=False, + extra_instruction_files=[str(self._extra_instruction_file)]) + + result = generator.generate() + + self._assert_generation_result(result) + prompt_text = self._read_prompt_path(result).read_text() + self.assertIn(str(self._extra_instruction_file.resolve()), + prompt_text, + msg='Extra instruction file path missing from prompt') + self.assertIn( + 'use them as supplemental guidance for dataset-specific mappings', + prompt_text) + self.assertIn( + 'Before starting, read all files listed in `extra_instruction_files`', + prompt_text) + self.assertIn('"extra_instruction_files": [', prompt_text) + + def test_accepts_absolute_extra_instruction_file_outside_working_directory( + self): + with tempfile.TemporaryDirectory() as other_dir: + external_file = Path(other_dir) / 'instructions.md' + external_file.write_text('Use contextual place names.', + encoding='utf-8') + + generator = self._make_generator( + is_sdmx=False, + extra_instruction_files=[str(external_file.resolve())]) + + result = generator.generate() + + self._assert_generation_result(result) + prompt_text = self._read_prompt_path(result).read_text() + self.assertIn(str(external_file.resolve()), prompt_text) + + def test_rejects_missing_extra_instruction_file(self): + with self.assertRaisesRegex(ValueError, + 'Extra instruction file does not exist'): + self._make_generator(is_sdmx=False, + extra_instruction_files=['missing.md']) + + def test_rejects_directory_extra_instruction_file(self): + extra_dir = Path('extra_dir') + extra_dir.mkdir() + with self.assertRaisesRegex(ValueError, + 'Extra instruction path is not a file'): + self._make_generator(is_sdmx=False, + extra_instruction_files=[str(extra_dir)]) + + def test_rejects_oversized_extra_instruction_file(self): + oversized_file = Path('oversized.txt') + oversized_file.write_text('abcdefghijk', encoding='utf-8') + with self.assertRaisesRegex(ValueError, + 'Extra instruction file is larger than'): + self._make_generator(is_sdmx=False, + extra_instruction_files=[str(oversized_file)], + extra_instruction_max_bytes=10) + + def test_rejects_non_utf8_extra_instruction_file(self): + binary_file = Path('binary.bin') + binary_file.write_bytes(b'\xff\xfe\x00') + with self.assertRaisesRegex( + ValueError, 'Extra instruction file is not valid UTF-8 text'): + self._make_generator(is_sdmx=False, + extra_instruction_files=[str(binary_file)]) + def test_rejects_invalid_output_path(self): data_config = DataConfig( input_data=[str(self._data_file)], diff --git a/tools/agentic_import/templates/generate_pvmap_prompt.j2 b/tools/agentic_import/templates/generate_pvmap_prompt.j2 index d180cee35d..81af2afc63 100644 --- a/tools/agentic_import/templates/generate_pvmap_prompt.j2 +++ b/tools/agentic_import/templates/generate_pvmap_prompt.j2 @@ -1316,6 +1316,9 @@ Follow these steps sequentially. - Input files and configuration is listed in `INPUT DATA FOR THIS RUN` section below. +{% if extra_instruction_files_abs %} +- If `extra_instruction_files` are listed for this run, read them first and use them as supplemental guidance for dataset-specific mappings. If they conflict, prefer this prompt's hard rules and the input files. +{% endif %} - **Carefully** examine examine **sample** input data `input_data` and metadata `input_metadata` files to understand the structure and content of the data. @@ -1466,7 +1469,8 @@ CRITICAL: Follow all SDMX-specific guidelines and use metadata for semantic mapp "input_metadata": {{input_metadata_abs | tojson}}, "working_dir": "{{working_dir_abs}}", "output_dir": "{{output_dir_abs}}", - "dataset_type": "{{dataset_type}}" + "dataset_type": "{{dataset_type}}", + "extra_instruction_files": {{extra_instruction_files_abs | tojson}} } ``` @@ -1491,6 +1495,10 @@ CRITICAL: Follow all SDMX-specific guidelines and use metadata for semantic mapp # ACTION REQUIRED NOW +{% if extra_instruction_files_abs %} +Before starting, read all files listed in `extra_instruction_files`; they contain critical run-specific instructions. +{% endif %} + **Execute** the data analysis and generate the `{{output_path_abs}}_pvmap.csv` and `{{output_path_abs}}_metadata.csv` files now. Follow the primary workflow **WITHOUT** deviation.