diff --git a/README.rst b/README.rst index a9562e8f2b..84967bec5c 100644 --- a/README.rst +++ b/README.rst @@ -293,6 +293,34 @@ applied directly, but should instead be manually inspected. E.g.: clas->class, clash, disabled because of name clash in c++ +Comments in dictionaries +------------------------ + +Dictionary files may contain comments. + +1. Pure comment:: + + # comment + #comment + +2. Inline comment must be preceded by whitespace:: + + abondon->abandon #comment + abondon->abandon # comment + + The ``#`` character is treated as the start of the comment only if it is + preceded by whitespace. + +3. Invalid comment examples:: + + abondon->abandon#comment + thenumberone->the#one + the#one->thenumberone + + In such cases, the whole line is considered malformed and will be ignored. + +4. Blank lines are also ignored. + Development setup ----------------- diff --git a/codespell_lib/_spellchecker.py b/codespell_lib/_spellchecker.py index 7b511e6d3e..40f81c54cf 100644 --- a/codespell_lib/_spellchecker.py +++ b/codespell_lib/_spellchecker.py @@ -54,7 +54,17 @@ def build_dict( with open(filename, encoding="utf-8") as f: translate_tables = [(x, str.maketrans(x, y)) for x, y in alt_chars] for line in f: - [key, data] = line.split("->") + left, pound, _ = line.partition("#") + if pound and left and left[-1] not in (" ", "\t"): + continue + + line = left.strip() + if not line: + continue + try: + key, data = line.split("->") + except ValueError: + continue # TODO: For now, convert both to lower. # Someday we can maybe add support for fixing caps.
key = key.lower() diff --git a/codespell_lib/tests/test_basic.py b/codespell_lib/tests/test_basic.py index 5120e1e8a1..7a23c3eb06 100644 --- a/codespell_lib/tests/test_basic.py +++ b/codespell_lib/tests/test_basic.py @@ -1524,3 +1524,36 @@ def test_args_from_file( print("Testing with direct call to cs_.main()") r = cs_.main(*args[1:]) print(f"{r=}") + + +def test_dict_comments( + tmp_path: Path, + capsys: pytest.CaptureFixture[str], +) -> None: + """Test dictionary comments and blank lines.""" + fname = tmp_path / "bad.txt" + fname.write_text("abandonned\noccured\n") + + dictionary = tmp_path / "test.txt" + dictionary.write_text( + "#comment\n" + "# comment\n" + " #comment\n" + "\n" + "\r\n" + "abandonned->abandoned # inline comment\n" + "occured->occurred# invalid inline comment\n" + "abil#ity->ability # hash in illegal position\n" + "ability->#ability # hash in illegal position\n" + "abilityability # no arrow\n", + encoding="utf-8", + ) + + # Allow valid inline comments. + # Skip entries where '#' is not preceded by whitespace. + result = cs.main("-D", dictionary, fname, std=True) + assert isinstance(result, tuple) + code, stdout, _ = result + assert code == 1 + assert "abandonned ==> abandoned" in stdout + assert "occured ==> occurred" not in stdout