Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 77 additions & 16 deletions tools/agentic_import/pvmap_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import subprocess
import sys
from datetime import datetime
from dataclasses import dataclass
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Optional

Expand All @@ -44,6 +44,11 @@ def _define_flags():
flags.DEFINE_list('input_metadata', [],
'List of input metadata file paths (optional)')

flags.DEFINE_list(
'extra_instruction_files', [],
'List of extra instruction file paths to make available to Gemini '
'(optional)')

flags.DEFINE_boolean(
'sdmx_dataset', False,
'Whether the dataset is in SDMX format (default: False)')
Expand Down Expand Up @@ -86,6 +91,11 @@ def _define_flags():
flags.DEFINE_string(
'working_dir', None,
'Working directory for the generator (default: current directory)')

flags.DEFINE_integer(
'extra_instruction_max_bytes', 65536,
'Maximum allowed size in bytes for each extra instruction file '
'(default: 65536)')
except flags.DuplicateFlagError:
pass

Expand All @@ -110,6 +120,8 @@ class Config:
output_path: str = 'output/output'
gemini_cli: Optional[str] = None
working_dir: Optional[str] = None
extra_instruction_files: List[str] = field(default_factory=list)
extra_instruction_max_bytes: int = 65536


@dataclass
Expand Down Expand Up @@ -137,6 +149,8 @@ def __init__(self, config: Config):
# Copy config to avoid modifying the original
self._config = copy.deepcopy(config)

self._validate_extra_instruction_max_bytes()

# Convert input_data paths to absolute
if self._config.data_config.input_data:
self._config.data_config.input_data = [
Expand All @@ -151,6 +165,13 @@ def __init__(self, config: Config):
for path in self._config.data_config.input_metadata
]

# Resolve and validate runtime extra instruction files.
if self._config.extra_instruction_files:
self._config.extra_instruction_files = [
self._validate_extra_instruction_file(path)
for path in self._config.extra_instruction_files
]

# Parse output_path into absolute path, handling relative paths and ~ expansion
output_path_raw = self._config.output_path
if not output_path_raw or not output_path_raw.strip():
Expand Down Expand Up @@ -186,10 +207,7 @@ def __init__(self, config: Config):

def _validate_and_convert_path(self, path: str) -> Path:
"""Convert path to absolute and validate it's within working directory."""
p = Path(path).expanduser()
if not p.is_absolute():
p = self._working_dir / p
real_path = p.resolve()
real_path = self._resolve_path(path)
working_dir = self._working_dir.resolve()
try:
real_path.relative_to(working_dir)
Expand All @@ -198,6 +216,43 @@ def _validate_and_convert_path(self, path: str) -> Path:
f"Path '{path}' is outside working directory '{working_dir}'")
return real_path

def _resolve_path(self, path: str) -> Path:
"""Resolve a path against the working directory when needed."""
p = Path(path).expanduser()
if not p.is_absolute():
p = self._working_dir / p
return p.resolve()

def _validate_extra_instruction_max_bytes(self) -> None:
"""Validate the configured size limit for extra instruction files."""
if self._config.extra_instruction_max_bytes < 0:
raise ValueError("extra_instruction_max_bytes must be non-negative")

def _validate_extra_instruction_file(self, path: str) -> Path:
"""Resolve and validate an extra instruction file."""
resolved_path = self._resolve_path(path)
Comment on lines +231 to +233
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The _validate_extra_instruction_file method uses _resolve_path which allows files outside the working directory. However, if enable_sandboxing is set to True (which is the default on macOS), the Gemini agent running in the sandbox will likely be restricted to the working directory and will fail to read these extra instruction files at runtime.

To ensure the agent can actually access these files, consider enforcing the same working directory boundary as input_data and input_metadata by using _validate_and_convert_path instead of _resolve_path.

Suggested change
def _validate_extra_instruction_file(self, path: str) -> Path:
"""Resolve and validate an extra instruction file."""
resolved_path = self._resolve_path(path)
def _validate_extra_instruction_file(self, path: str) -> Path:
"""Resolve and validate an extra instruction file."""
resolved_path = self._validate_and_convert_path(path)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pls check if Gemini suggestion for sandbox mode should be supported

if not resolved_path.exists():
raise ValueError(
f"Extra instruction file does not exist: {resolved_path}")
if not resolved_path.is_file():
raise ValueError(
f"Extra instruction path is not a file: {resolved_path}")

file_size = resolved_path.stat().st_size
max_bytes = self._config.extra_instruction_max_bytes
if file_size > max_bytes:
raise ValueError(
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we truncate file with a warning rather than terminate program or in future summarize the file to fit in size threshold?

f"Extra instruction file is larger than {max_bytes} bytes: "
f"{resolved_path}")

try:
resolved_path.read_text(encoding='utf-8')
except UnicodeDecodeError:
raise ValueError(f"Extra instruction file is not valid UTF-8 text: "
f"{resolved_path}")

return resolved_path

def _initialize_datacommons_dir(self) -> Path:
"""Initialize and return the .datacommons directory path."""
dc_dir = self._working_dir / '.datacommons'
Expand Down Expand Up @@ -419,7 +474,10 @@ def _generate_prompt(self) -> Path:
'output_basename':
self._output_basename, # Base name for pvmap/metadata files
'run_dir_abs':
str(self._run_dir)
str(self._run_dir),
'extra_instruction_files_abs': [
str(path) for path in self._config.extra_instruction_files
] if self._config.extra_instruction_files else [],
}

# Render template with these variables
Expand All @@ -440,16 +498,19 @@ def prepare_config() -> Config:
input_metadata=_FLAGS.input_metadata or [],
is_sdmx_dataset=_FLAGS.sdmx_dataset)

return Config(data_config=data_config,
dry_run=_FLAGS.dry_run,
maps_api_key=_FLAGS.maps_api_key,
dc_api_key=_FLAGS.dc_api_key,
max_iterations=_FLAGS.max_iterations,
skip_confirmation=_FLAGS.skip_confirmation,
enable_sandboxing=_FLAGS.enable_sandboxing,
output_path=_FLAGS.output_path,
gemini_cli=_FLAGS.gemini_cli,
working_dir=_FLAGS.working_dir)
return Config(
data_config=data_config,
dry_run=_FLAGS.dry_run,
maps_api_key=_FLAGS.maps_api_key,
dc_api_key=_FLAGS.dc_api_key,
max_iterations=_FLAGS.max_iterations,
skip_confirmation=_FLAGS.skip_confirmation,
enable_sandboxing=_FLAGS.enable_sandboxing,
output_path=_FLAGS.output_path,
gemini_cli=_FLAGS.gemini_cli,
working_dir=_FLAGS.working_dir,
extra_instruction_files=_FLAGS.extra_instruction_files or [],
extra_instruction_max_bytes=_FLAGS.extra_instruction_max_bytes)


def main(_):
Expand Down
83 changes: 82 additions & 1 deletion tools/agentic_import/pvmap_generator_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,22 @@ def setUp(self):
self._data_file.write_text('header\nvalue')
self._metadata_file = Path('metadata.csv')
self._metadata_file.write_text('parameter,value')
self._extra_instruction_file = Path('extra_instructions.md')
self._extra_instruction_file.write_text(
'Use provided place guidance before auto-resolution.',
encoding='utf-8')

def tearDown(self):
os.chdir(
self._cwd) # Restore prior cwd so later tests see original state.
self._temp_dir.cleanup()

def _make_generator(self, *, is_sdmx: bool) -> PVMapGenerator:
def _make_generator(self,
*,
is_sdmx: bool,
extra_instruction_files=None,
extra_instruction_max_bytes: int = 65536,
working_dir=None) -> PVMapGenerator:
data_config = DataConfig(
input_data=[str(self._data_file)],
input_metadata=[str(self._metadata_file)],
Expand All @@ -52,6 +61,9 @@ def _make_generator(self, *, is_sdmx: bool) -> PVMapGenerator:
dry_run=True,
max_iterations=3,
output_path='output/output_file',
extra_instruction_files=extra_instruction_files or [],
extra_instruction_max_bytes=extra_instruction_max_bytes,
working_dir=working_dir,
)
return PVMapGenerator(config)

Expand Down Expand Up @@ -131,6 +143,7 @@ def _assert_prompt_content(self, prompt_path: Path, *, expect_sdmx: bool,
# Working directory reference should match the temp execution root.
expected_working_dir = str(Path(self._temp_dir.name).resolve())
self.assertIn(expected_working_dir, prompt_text)
self.assertIn('"extra_instruction_files": []', prompt_text)

def test_generate_prompt_csv(self):
generator = self._make_generator(is_sdmx=False)
Expand Down Expand Up @@ -231,6 +244,74 @@ def test_generate_prompt_with_relative_working_dir(self):
self.assertIn(str(data_file.resolve()), prompt_text)
self.assertIn(str(metadata_file.resolve()), prompt_text)

def test_generate_prompt_with_relative_extra_instruction_file(self):
    """A relative extra instruction path is resolved and rendered into the prompt.

    Verifies the resolved absolute path appears in the prompt, along with
    the template's guidance sentences that are emitted only when extra
    instruction files are present.
    """
    generator = self._make_generator(
        is_sdmx=False,
        extra_instruction_files=[str(self._extra_instruction_file)])

    result = generator.generate()

    self._assert_generation_result(result)
    prompt_text = self._read_prompt_path(result).read_text()
    # Relative input should have been converted to an absolute path.
    self.assertIn(str(self._extra_instruction_file.resolve()),
                  prompt_text,
                  msg='Extra instruction file path missing from prompt')
    self.assertIn(
        'use them as supplemental guidance for dataset-specific mappings',
        prompt_text)
    self.assertIn(
        'Before starting, read all files listed in `extra_instruction_files`',
        prompt_text)
    # JSON config block in the prompt should list the files (non-empty array).
    self.assertIn('"extra_instruction_files": [', prompt_text)

def test_accepts_absolute_extra_instruction_file_outside_working_directory(
        self):
    """Absolute extra instruction paths outside the working dir are accepted.

    Unlike input_data/input_metadata, extra instruction files are not
    restricted to the working directory boundary.
    """
    with tempfile.TemporaryDirectory() as other_dir:
        external_file = Path(other_dir) / 'instructions.md'
        external_file.write_text('Use contextual place names.',
                                 encoding='utf-8')

        generator = self._make_generator(
            is_sdmx=False,
            extra_instruction_files=[str(external_file.resolve())])

        result = generator.generate()

        self._assert_generation_result(result)
        prompt_text = self._read_prompt_path(result).read_text()
        self.assertIn(str(external_file.resolve()), prompt_text)

def test_rejects_missing_extra_instruction_file(self):
    """A nonexistent extra instruction file raises ValueError at construction."""
    with self.assertRaisesRegex(ValueError,
                                'Extra instruction file does not exist'):
        self._make_generator(is_sdmx=False,
                             extra_instruction_files=['missing.md'])

def test_rejects_directory_extra_instruction_file(self):
    """Passing a directory as an extra instruction file raises ValueError."""
    extra_dir = Path('extra_dir')
    extra_dir.mkdir()
    with self.assertRaisesRegex(ValueError,
                                'Extra instruction path is not a file'):
        self._make_generator(is_sdmx=False,
                             extra_instruction_files=[str(extra_dir)])

def test_rejects_oversized_extra_instruction_file(self):
    """A file exceeding extra_instruction_max_bytes raises ValueError."""
    oversized_file = Path('oversized.txt')
    # 11 ASCII bytes, one over the 10-byte limit configured below.
    oversized_file.write_text('abcdefghijk', encoding='utf-8')
    with self.assertRaisesRegex(ValueError,
                                'Extra instruction file is larger than'):
        self._make_generator(is_sdmx=False,
                             extra_instruction_files=[str(oversized_file)],
                             extra_instruction_max_bytes=10)

def test_rejects_non_utf8_extra_instruction_file(self):
    """Binary (non-UTF-8) extra instruction content raises ValueError."""
    binary_file = Path('binary.bin')
    # 0xFF/0xFE are invalid UTF-8 lead bytes, forcing a decode failure.
    binary_file.write_bytes(b'\xff\xfe\x00')
    with self.assertRaisesRegex(
            ValueError, 'Extra instruction file is not valid UTF-8 text'):
        self._make_generator(is_sdmx=False,
                             extra_instruction_files=[str(binary_file)])

def test_rejects_invalid_output_path(self):
data_config = DataConfig(
input_data=[str(self._data_file)],
Expand Down
10 changes: 9 additions & 1 deletion tools/agentic_import/templates/generate_pvmap_prompt.j2
Original file line number Diff line number Diff line change
Expand Up @@ -1316,6 +1316,9 @@ Follow these steps sequentially.

- Input files and configuration is listed in `INPUT DATA FOR THIS RUN` section
below.
{% if extra_instruction_files_abs %}
- If `extra_instruction_files` are listed for this run, read them first and use them as supplemental guidance for dataset-specific mappings. If they conflict, prefer this prompt's hard rules and the input files.
{% endif %}
- **Carefully** examine **sample** input data `input_data` and metadata
`input_metadata` files to understand the structure and content of the data.

Expand Down Expand Up @@ -1466,7 +1469,8 @@ CRITICAL: Follow all SDMX-specific guidelines and use metadata for semantic mapp
"input_metadata": {{input_metadata_abs | tojson}},
"working_dir": "{{working_dir_abs}}",
"output_dir": "{{output_dir_abs}}",
"dataset_type": "{{dataset_type}}"
"dataset_type": "{{dataset_type}}",
"extra_instruction_files": {{extra_instruction_files_abs | tojson}}
}
```

Expand All @@ -1491,6 +1495,10 @@ CRITICAL: Follow all SDMX-specific guidelines and use metadata for semantic mapp

# ACTION REQUIRED NOW

{% if extra_instruction_files_abs %}
Before starting, read all files listed in `extra_instruction_files`; they contain critical run-specific instructions.
{% endif %}

**Execute** the data analysis and generate the `{{output_path_abs}}_pvmap.csv` and `{{output_path_abs}}_metadata.csv`
files now. Follow the primary workflow **WITHOUT** deviation.

Expand Down
Loading