Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 77 additions & 16 deletions tools/agentic_import/pvmap_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import subprocess
import sys
from datetime import datetime
from dataclasses import dataclass
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Optional

Expand All @@ -44,6 +44,11 @@ def _define_flags():
flags.DEFINE_list('input_metadata', [],
'List of input metadata file paths (optional)')

flags.DEFINE_list(
'extra_instruction_files', [],
'List of extra instruction file paths to make available to Gemini '
'(optional)')

flags.DEFINE_boolean(
'sdmx_dataset', False,
'Whether the dataset is in SDMX format (default: False)')
Expand Down Expand Up @@ -86,6 +91,11 @@ def _define_flags():
flags.DEFINE_string(
'working_dir', None,
'Working directory for the generator (default: current directory)')

flags.DEFINE_integer(
'extra_instruction_max_bytes', 65536,
'Maximum allowed size in bytes for each extra instruction file '
'(default: 65536)')
except flags.DuplicateFlagError:
pass

Expand All @@ -110,6 +120,8 @@ class Config:
output_path: str = 'output/output'
gemini_cli: Optional[str] = None
working_dir: Optional[str] = None
extra_instruction_files: List[str] = field(default_factory=list)
extra_instruction_max_bytes: int = 65536


@dataclass
Expand Down Expand Up @@ -137,6 +149,8 @@ def __init__(self, config: Config):
# Copy config to avoid modifying the original
self._config = copy.deepcopy(config)

self._validate_extra_instruction_max_bytes()

# Convert input_data paths to absolute
if self._config.data_config.input_data:
self._config.data_config.input_data = [
Expand All @@ -151,6 +165,13 @@ def __init__(self, config: Config):
for path in self._config.data_config.input_metadata
]

# Resolve and validate runtime extra instruction files.
if self._config.extra_instruction_files:
self._config.extra_instruction_files = [
self._validate_extra_instruction_file(path)
for path in self._config.extra_instruction_files
]

# Parse output_path into absolute path, handling relative paths and ~ expansion
output_path_raw = self._config.output_path
if not output_path_raw or not output_path_raw.strip():
Expand Down Expand Up @@ -186,10 +207,7 @@ def __init__(self, config: Config):

def _validate_and_convert_path(self, path: str) -> Path:
"""Convert path to absolute and validate it's within working directory."""
p = Path(path).expanduser()
if not p.is_absolute():
p = self._working_dir / p
real_path = p.resolve()
real_path = self._resolve_path(path)
working_dir = self._working_dir.resolve()
try:
real_path.relative_to(working_dir)
Expand All @@ -198,6 +216,43 @@ def _validate_and_convert_path(self, path: str) -> Path:
f"Path '{path}' is outside working directory '{working_dir}'")
return real_path

def _resolve_path(self, path: str) -> Path:
"""Resolve a path against the working directory when needed."""
p = Path(path).expanduser()
if not p.is_absolute():
p = self._working_dir / p
return p.resolve()

def _validate_extra_instruction_max_bytes(self) -> None:
"""Validate the configured size limit for extra instruction files."""
if self._config.extra_instruction_max_bytes < 0:
raise ValueError("extra_instruction_max_bytes must be non-negative")

def _validate_extra_instruction_file(self, path: str) -> Path:
"""Resolve and validate an extra instruction file."""
resolved_path = self._resolve_path(path)
Comment on lines +231 to +233
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The _validate_extra_instruction_file method uses _resolve_path which allows files outside the working directory. However, if enable_sandboxing is set to True (which is the default on macOS), the Gemini agent running in the sandbox will likely be restricted to the working directory and will fail to read these extra instruction files at runtime.

To ensure the agent can actually access these files, consider enforcing the same working directory boundary as input_data and input_metadata by using _validate_and_convert_path instead of _resolve_path.

Suggested change
def _validate_extra_instruction_file(self, path: str) -> Path:
"""Resolve and validate an extra instruction file."""
resolved_path = self._resolve_path(path)
def _validate_extra_instruction_file(self, path: str) -> Path:
"""Resolve and validate an extra instruction file."""
resolved_path = self._validate_and_convert_path(path)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pls check if Gemini suggestion for sandbox mode should be supported

if not resolved_path.exists():
raise ValueError(
f"Extra instruction file does not exist: {resolved_path}")
if not resolved_path.is_file():
raise ValueError(
f"Extra instruction path is not a file: {resolved_path}")

file_size = resolved_path.stat().st_size
max_bytes = self._config.extra_instruction_max_bytes
if file_size > max_bytes:
raise ValueError(
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we truncate file with a warning rather than terminate program or in future summarize the file to fit in size threshold?

f"Extra instruction file is larger than {max_bytes} bytes: "
f"{resolved_path}")

try:
resolved_path.read_text(encoding='utf-8')
except UnicodeDecodeError:
raise ValueError(f"Extra instruction file is not valid UTF-8 text: "
f"{resolved_path}")

return resolved_path

def _initialize_datacommons_dir(self) -> Path:
"""Initialize and return the .datacommons directory path."""
dc_dir = self._working_dir / '.datacommons'
Expand Down Expand Up @@ -419,7 +474,10 @@ def _generate_prompt(self) -> Path:
'output_basename':
self._output_basename, # Base name for pvmap/metadata files
'run_dir_abs':
str(self._run_dir)
str(self._run_dir),
'extra_instruction_files_abs': [
str(path) for path in self._config.extra_instruction_files
] if self._config.extra_instruction_files else [],
}

# Render template with these variables
Expand All @@ -440,16 +498,19 @@ def prepare_config() -> Config:
input_metadata=_FLAGS.input_metadata or [],
is_sdmx_dataset=_FLAGS.sdmx_dataset)

return Config(data_config=data_config,
dry_run=_FLAGS.dry_run,
maps_api_key=_FLAGS.maps_api_key,
dc_api_key=_FLAGS.dc_api_key,
max_iterations=_FLAGS.max_iterations,
skip_confirmation=_FLAGS.skip_confirmation,
enable_sandboxing=_FLAGS.enable_sandboxing,
output_path=_FLAGS.output_path,
gemini_cli=_FLAGS.gemini_cli,
working_dir=_FLAGS.working_dir)
return Config(
data_config=data_config,
dry_run=_FLAGS.dry_run,
maps_api_key=_FLAGS.maps_api_key,
dc_api_key=_FLAGS.dc_api_key,
max_iterations=_FLAGS.max_iterations,
skip_confirmation=_FLAGS.skip_confirmation,
enable_sandboxing=_FLAGS.enable_sandboxing,
output_path=_FLAGS.output_path,
gemini_cli=_FLAGS.gemini_cli,
working_dir=_FLAGS.working_dir,
extra_instruction_files=_FLAGS.extra_instruction_files or [],
extra_instruction_max_bytes=_FLAGS.extra_instruction_max_bytes)


def main(_):
Expand Down
83 changes: 82 additions & 1 deletion tools/agentic_import/pvmap_generator_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,22 @@ def setUp(self):
self._data_file.write_text('header\nvalue')
self._metadata_file = Path('metadata.csv')
self._metadata_file.write_text('parameter,value')
self._extra_instruction_file = Path('extra_instructions.md')
self._extra_instruction_file.write_text(
'Use provided place guidance before auto-resolution.',
encoding='utf-8')

def tearDown(self):
os.chdir(
self._cwd) # Restore prior cwd so later tests see original state.
self._temp_dir.cleanup()

def _make_generator(self, *, is_sdmx: bool) -> PVMapGenerator:
def _make_generator(self,
*,
is_sdmx: bool,
extra_instruction_files=None,
extra_instruction_max_bytes: int = 65536,
working_dir=None) -> PVMapGenerator:
data_config = DataConfig(
input_data=[str(self._data_file)],
input_metadata=[str(self._metadata_file)],
Expand All @@ -52,6 +61,9 @@ def _make_generator(self, *, is_sdmx: bool) -> PVMapGenerator:
dry_run=True,
max_iterations=3,
output_path='output/output_file',
extra_instruction_files=extra_instruction_files or [],
extra_instruction_max_bytes=extra_instruction_max_bytes,
working_dir=working_dir,
)
return PVMapGenerator(config)

Expand Down Expand Up @@ -131,6 +143,7 @@ def _assert_prompt_content(self, prompt_path: Path, *, expect_sdmx: bool,
# Working directory reference should match the temp execution root.
expected_working_dir = str(Path(self._temp_dir.name).resolve())
self.assertIn(expected_working_dir, prompt_text)
self.assertIn('"extra_instruction_files": []', prompt_text)

def test_generate_prompt_csv(self):
generator = self._make_generator(is_sdmx=False)
Expand Down Expand Up @@ -231,6 +244,74 @@ def test_generate_prompt_with_relative_working_dir(self):
self.assertIn(str(data_file.resolve()), prompt_text)
self.assertIn(str(metadata_file.resolve()), prompt_text)

def test_generate_prompt_with_relative_extra_instruction_file(self):
    """A relative extra instruction path is resolved and rendered into the prompt.

    Verifies the resolved absolute path appears in the prompt, along with
    the template's guidance sentences that are emitted only when extra
    instruction files are present.
    """
    generator = self._make_generator(
        is_sdmx=False,
        extra_instruction_files=[str(self._extra_instruction_file)])

    result = generator.generate()

    self._assert_generation_result(result)
    prompt_text = self._read_prompt_path(result).read_text()
    # Relative input should have been converted to an absolute path.
    self.assertIn(str(self._extra_instruction_file.resolve()),
                  prompt_text,
                  msg='Extra instruction file path missing from prompt')
    self.assertIn(
        'use them as supplemental guidance for dataset-specific mappings',
        prompt_text)
    self.assertIn(
        'Before starting, read all files listed in `extra_instruction_files`',
        prompt_text)
    # JSON config block in the prompt should list the files (non-empty array).
    self.assertIn('"extra_instruction_files": [', prompt_text)

def test_accepts_absolute_extra_instruction_file_outside_working_directory(
        self):
    """Absolute extra instruction paths outside the working dir are accepted.

    Unlike input_data/input_metadata, extra instruction files are not
    restricted to the working directory boundary.
    """
    with tempfile.TemporaryDirectory() as other_dir:
        external_file = Path(other_dir) / 'instructions.md'
        external_file.write_text('Use contextual place names.',
                                 encoding='utf-8')

        generator = self._make_generator(
            is_sdmx=False,
            extra_instruction_files=[str(external_file.resolve())])

        result = generator.generate()

        self._assert_generation_result(result)
        prompt_text = self._read_prompt_path(result).read_text()
        self.assertIn(str(external_file.resolve()), prompt_text)

def test_rejects_missing_extra_instruction_file(self):
    """A nonexistent extra instruction file raises ValueError at construction."""
    with self.assertRaisesRegex(ValueError,
                                'Extra instruction file does not exist'):
        self._make_generator(is_sdmx=False,
                             extra_instruction_files=['missing.md'])

def test_rejects_directory_extra_instruction_file(self):
    """Passing a directory as an extra instruction file raises ValueError."""
    extra_dir = Path('extra_dir')
    extra_dir.mkdir()
    with self.assertRaisesRegex(ValueError,
                                'Extra instruction path is not a file'):
        self._make_generator(is_sdmx=False,
                             extra_instruction_files=[str(extra_dir)])

def test_rejects_oversized_extra_instruction_file(self):
    """A file exceeding extra_instruction_max_bytes raises ValueError."""
    oversized_file = Path('oversized.txt')
    # 11 ASCII bytes, one over the 10-byte limit configured below.
    oversized_file.write_text('abcdefghijk', encoding='utf-8')
    with self.assertRaisesRegex(ValueError,
                                'Extra instruction file is larger than'):
        self._make_generator(is_sdmx=False,
                             extra_instruction_files=[str(oversized_file)],
                             extra_instruction_max_bytes=10)

def test_rejects_non_utf8_extra_instruction_file(self):
    """Binary (non-UTF-8) extra instruction content raises ValueError."""
    binary_file = Path('binary.bin')
    # 0xFF/0xFE are invalid UTF-8 lead bytes, forcing a decode failure.
    binary_file.write_bytes(b'\xff\xfe\x00')
    with self.assertRaisesRegex(
            ValueError, 'Extra instruction file is not valid UTF-8 text'):
        self._make_generator(is_sdmx=False,
                             extra_instruction_files=[str(binary_file)])

def test_rejects_invalid_output_path(self):
data_config = DataConfig(
input_data=[str(self._data_file)],
Expand Down
10 changes: 9 additions & 1 deletion tools/agentic_import/templates/generate_pvmap_prompt.j2
Original file line number Diff line number Diff line change
Expand Up @@ -1316,6 +1316,9 @@ Follow these steps sequentially.

- Input files and configuration is listed in `INPUT DATA FOR THIS RUN` section
below.
{% if extra_instruction_files_abs %}
- If `extra_instruction_files` are listed for this run, read them first and use them as supplemental guidance for dataset-specific mappings. If they conflict, prefer this prompt's hard rules and the input files.
{% endif %}
- **Carefully** examine **sample** input data `input_data` and metadata
`input_metadata` files to understand the structure and content of the data.

Expand Down Expand Up @@ -1466,7 +1469,8 @@ CRITICAL: Follow all SDMX-specific guidelines and use metadata for semantic mapp
"input_metadata": {{input_metadata_abs | tojson}},
"working_dir": "{{working_dir_abs}}",
"output_dir": "{{output_dir_abs}}",
"dataset_type": "{{dataset_type}}"
"dataset_type": "{{dataset_type}}",
"extra_instruction_files": {{extra_instruction_files_abs | tojson}}
}
```

Expand All @@ -1491,6 +1495,10 @@ CRITICAL: Follow all SDMX-specific guidelines and use metadata for semantic mapp

# ACTION REQUIRED NOW

{% if extra_instruction_files_abs %}
Before starting, read all files listed in `extra_instruction_files`; they contain critical run-specific instructions.
{% endif %}

**Execute** the data analysis and generate the `{{output_path_abs}}_pvmap.csv` and `{{output_path_abs}}_metadata.csv`
files now. Follow the primary workflow **WITHOUT** deviation.

Expand Down
Loading