From c687a5f96211b2cb3807cdd09e8933711aa50e37 Mon Sep 17 00:00:00 2001 From: Taher Date: Tue, 3 Feb 2026 19:37:32 -0500 Subject: [PATCH 1/9] FEAT: Generalize Colloquial Wordswap Attack Converter (#418) --- .../colloquial_wordswaps/filipino.yaml | 15 +++++ .../colloquial_wordswaps/indian.yaml | 15 +++++ .../multicultural_london.yaml | 15 +++++ .../colloquial_wordswaps/singaporean.yaml | 15 +++++ .../southern_american.yaml | 15 +++++ .../colloquial_wordswap_converter.py | 61 +++++++++++-------- .../test_colloquial_wordswap_converter.py | 33 +++++++--- 7 files changed, 136 insertions(+), 33 deletions(-) create mode 100644 pyrit/datasets/prompt_converters/colloquial_wordswaps/filipino.yaml create mode 100644 pyrit/datasets/prompt_converters/colloquial_wordswaps/indian.yaml create mode 100644 pyrit/datasets/prompt_converters/colloquial_wordswaps/multicultural_london.yaml create mode 100644 pyrit/datasets/prompt_converters/colloquial_wordswaps/singaporean.yaml create mode 100644 pyrit/datasets/prompt_converters/colloquial_wordswaps/southern_american.yaml diff --git a/pyrit/datasets/prompt_converters/colloquial_wordswaps/filipino.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/filipino.yaml new file mode 100644 index 0000000000..0bcf1dedc8 --- /dev/null +++ b/pyrit/datasets/prompt_converters/colloquial_wordswaps/filipino.yaml @@ -0,0 +1,15 @@ +father: ["papa", "itay", "tay", "dad", "erpat"] +mother: ["mama", "inay", "nay", "mom", "ermat"] +grandfather: ["lolo", "lo", "lolo paps"] +grandmother: ["lola", "la", "lola mams"] +girl: ["nene", "inday", "ganda"] +boy: ["totoy", "dodong", "pogi"] +son: ["anak", "junior", "boy"] +daughter: ["anak", "nene", "ga"] +aunty: ["tita", "tiya", "tante"] +aunt: ["tita", "tiyang"] +man: ["kuya", "boss", "pare", "lodi", "chong"] +woman: ["ate", "miss", "ganda", "marites"] +uncle: ["tito", "tiyo", "tito paps"] +sister: ["ate", "sis", "sissy"] +brother: ["kuya", "bro", "paps", "tol"] \ No newline at end of file diff --git a/pyrit/datasets/prompt_converters/colloquial_wordswaps/indian.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/indian.yaml new file mode 100644 index 0000000000..56d5da5c3b --- /dev/null +++ b/pyrit/datasets/prompt_converters/colloquial_wordswaps/indian.yaml @@ -0,0 +1,15 @@ +father: ["papa", "bauji", "abba", "pitaji", "dad", "pop"] +mother: ["mummy", "maa", "ammi", "mataji", "amman"] +grandfather: ["dada", "nana", "dadaji", "nanaji"] +grandmother: ["dadi", "nani", "dadiji", "naniji"] +girl: ["beti", "kudi", "larki", "gudiya"] +boy: ["beta", "munda", "larka", "chokra"] +son: ["beta", "ladla", "puttar"] +daughter: ["beti", "bitiya", "laado"] +aunty: ["aunty", "mausi", "bua", "chachi", "mami"] +aunt: ["aunty", "mausi", "bua", "chachi", "mami"] +man: ["bhai", "bhaiyya", "uncle", "banda", "yaar"] +woman: ["behenji", "didiji", "bandi", "memsaab"] +uncle: ["uncle", "chacha", "mama", "fufa", "tauji"] +sister: ["didi", "behen", "behna", "sis"] +brother: ["bhai", "bhaiyya", "bro", "veer", "paji"] \ No newline at end of file diff --git a/pyrit/datasets/prompt_converters/colloquial_wordswaps/multicultural_london.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/multicultural_london.yaml new file mode 100644 index 0000000000..3b72c0530a --- /dev/null +++ b/pyrit/datasets/prompt_converters/colloquial_wordswaps/multicultural_london.yaml @@ -0,0 +1,15 @@ +father: ["pops", "old man", "dadman", "pa"] +mother: ["mumsy", "moms", "mummy", "ma"] +grandfather: ["grandad", "gramps"] +grandmother: ["nan", "nanna", "gran"] +girl: ["gyal", "ting", "shorty", " peng ting"] +boy: ["yute", "man", "lil man"] +son: ["yute", "junior", "son-son"] +daughter: ["princess", "baby girl"] +aunty: ["aunty", "tantie"] +aunt: ["aunty", "aunt"] +man: ["mandem", "man", "geezer", "bloke"] +woman: ["gyal", "lady", "madam"] +uncle: ["unc", "uncle"] +sister: ["sis", "sissy"] +brother: ["bruv", "bredrin", "fam", "blood", "brudda"] \ No newline at end of file diff --git a/pyrit/datasets/prompt_converters/colloquial_wordswaps/singaporean.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/singaporean.yaml new file mode 100644 index 0000000000..1e87da2979 --- /dev/null +++ b/pyrit/datasets/prompt_converters/colloquial_wordswaps/singaporean.yaml @@ -0,0 +1,15 @@ +father: ["papa", "lao bei", "lim pei", "bapa", "appa"] +mother: ["mama", "amma", "ibu"] +grandfather: ["ah gong", "thatha", "dato"] +grandmother: ["ah ma", "patti", "nenek"] +girl: ["ah ger", "ponnu"] +boy: ["ah boy", "boi", "payyan"] +son: ["ah boy", "boi", "payyan"] +daughter: ["ah ger", "ponnu"] +aunty: ["makcik", "maami"] +aunt: ["makcik", "maami"] +man: ["ah beng", "shuai ge"] +woman: ["ah lian", "xiao mei"] +uncle: ["encik", "unker"] +sister: ["xjj", "jie jie", "zhezhe", "kaka", "akka", "thangatchi"] +brother: ["bro", "boiboi", "di di", "xdd", "anneh", "thambi"] \ No newline at end of file diff --git a/pyrit/datasets/prompt_converters/colloquial_wordswaps/southern_american.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/southern_american.yaml new file mode 100644 index 0000000000..a5b7ff4511 --- /dev/null +++ b/pyrit/datasets/prompt_converters/colloquial_wordswaps/southern_american.yaml @@ -0,0 +1,15 @@ +father: ["daddy", "pa", "pops", "old man"] +mother: ["mama", "momma", "ma"] +grandfather: ["pawpaw", "grandpappy", "gramps"] +grandmother: ["mawmaw", "meemaw", "nana", "grammy"] +girl: ["gal", "missy", "sugar"] +boy: ["bubba", "sonny", "boy"] +son: ["junior", "bub", "son"] +daughter: ["sissy", "honey", "sugar"] +aunty: ["auntie"] +aunt: ["auntie"] +man: ["fella", "hoss", "bubba"] +woman: ["ma'am", "lady", "missy"] +uncle: ["unk"] +sister: ["sissy", "sis"] +brother: ["bubba", "bro"] \ No newline at end of file diff --git a/pyrit/prompt_converter/colloquial_wordswap_converter.py b/pyrit/prompt_converter/colloquial_wordswap_converter.py index 96ab2d6c15..94e110c324 100644 --- a/pyrit/prompt_converter/colloquial_wordswap_converter.py +++ b/pyrit/prompt_converter/colloquial_wordswap_converter.py @@ -3,7 +3,10 @@ import random import re -from typing import Dict, List, Optional +from pathlib import Path +from typing import Optional + +import yaml from pyrit.identifiers import ConverterIdentifier from pyrit.models import PromptDataType @@ -18,38 +21,44 @@ class ColloquialWordswapConverter(PromptConverter): SUPPORTED_INPUT_TYPES = ("text",) SUPPORTED_OUTPUT_TYPES = ("text",) - def __init__( - self, deterministic: bool = False, custom_substitutions: Optional[Dict[str, List[str]]] = None - ) -> None: + def __init__(self, deterministic: bool = False, wordswap_path: Optional[str] = None) -> None: """ Initialize the converter with optional deterministic mode and custom substitutions. Args: deterministic (bool): If True, use the first substitution for each wordswap. If False, randomly choose a substitution for each wordswap. Defaults to False. - custom_substitutions (Optional[Dict[str, List[str]]], Optional): A dictionary of custom substitutions to - override the defaults. Defaults to None. + wordswap_path (Optional[str]): File name of a YAML file in ../../datasets/prompt_converters/colloquial_wordswaps + directory containing a dictionary of substitutions. Defaults to None. + + Raises: + FileNotFoundError: If the wordswap YAML file is not found. + ValueError: If the YAML file is formatted incorrectly or empty. """ - default_substitutions = { - "father": ["papa", "lao bei", "lim pei", "bapa", "appa"], - "mother": ["mama", "amma", "ibu"], - "grandfather": ["ah gong", "thatha", "dato"], - "grandmother": ["ah ma", "patti", "nenek"], - "girl": ["ah ger", "ponnu"], - "boy": ["ah boy", "boi", "payyan"], - "son": ["ah boy", "boi", "payyan"], - "daughter": ["ah ger", "ponnu"], - "aunt": ["makcik", "maami"], - "aunty": ["makcik", "maami"], - "man": ["ah beng", "shuai ge"], - "woman": ["ah lian", "xiao mei"], - "uncle": ["encik", "unker"], - "sister": ["xjj", "jie jie", "zhezhe", "kaka", "akka", "thangatchi"], - "brother": ["bro", "boiboi", "di di", "xdd", "anneh", "thambi"], - } - - # Use custom substitutions if provided, otherwise default to the standard ones - self._colloquial_substitutions = custom_substitutions if custom_substitutions else default_substitutions + # Use custom substitutions if wordswap_path provided, otherwise default to singaporean.yaml + if wordswap_path: + file_path = ( + Path(__file__).parent.parent / "datasets" / "prompt_converters" / "colloquial_wordswaps" / wordswap_path + ) + else: + file_path = ( + Path(__file__).parent.parent + / "datasets" + / "prompt_converters" + / "colloquial_wordswaps" + / "singaporean.yaml" + ) + + if not file_path.exists(): + raise FileNotFoundError(f"Colloquial wordswap file not found: {file_path}") + + with file_path.open("r", encoding="utf-8") as f: + data = yaml.safe_load(f) + # Ensure that wordswap YAML is in the correct format. + if not isinstance(data, dict): + raise ValueError("Wordswap YAML must contain a dictionary of word -> list of substitutions") + + self._colloquial_substitutions = data self._deterministic = deterministic def _build_identifier(self) -> ConverterIdentifier: diff --git a/tests/unit/converter/test_colloquial_wordswap_converter.py b/tests/unit/converter/test_colloquial_wordswap_converter.py index f8203511fa..520a173dca 100644 --- a/tests/unit/converter/test_colloquial_wordswap_converter.py +++ b/tests/unit/converter/test_colloquial_wordswap_converter.py @@ -61,21 +61,24 @@ async def test_colloquial_non_deterministic(input_text): assert output_word == input_word -# Test for custom substitutions +# Test for nondefault substitutions. @pytest.mark.asyncio @pytest.mark.parametrize( - "input_text,custom_substitutions,expected_output", + "input_text,wordswap_path,expected_output", [ - ("father", {"father": ["appa", "darth vader"]}, "appa"), # Custom substitution father -> appa + ("father", "filipino.yaml", "papa"), + ("woman", "indian.yaml", "behenji"), + ("son", "southern_american.yaml", "junior"), + ("man", "multicultural_london.yaml", "mandem"), ], ) -async def test_colloquial_custom_substitutions(input_text, custom_substitutions, expected_output): - converter = ColloquialWordswapConverter(deterministic=True, custom_substitutions=custom_substitutions) +async def test_colloquial_custom_substitutions(input_text, wordswap_path, expected_output): + converter = ColloquialWordswapConverter(deterministic=True, wordswap_path=wordswap_path) result = await converter.convert_async(prompt=input_text) assert result.output_text == expected_output -# Test for empty custom substitutions +# Test for empty wordswap_path @pytest.mark.asyncio @pytest.mark.parametrize( "input_text,expected_output", @@ -84,7 +87,7 @@ async def test_colloquial_custom_substitutions(input_text, custom_substitutions, ], ) async def test_colloquial_empty_custom_substitutions(input_text, expected_output): - converter = ColloquialWordswapConverter(deterministic=True, custom_substitutions={}) + converter = ColloquialWordswapConverter(deterministic=True, wordswap_path="") result = await converter.convert_async(prompt=input_text) assert result.output_text == expected_output @@ -105,6 +108,22 @@ async def test_multiple_words(input_text, expected_output): assert result.output_text == expected_output +# Test multiple word prompts for custom colloquialism +@pytest.mark.asyncio +@pytest.mark.parametrize( + "input_text,wordswap_path,expected_output", + [ + ("father and mother", "indian.yaml", "papa and mummy"), + ("brother and sister", "southern_american.yaml", "bubba and sissy"), + ("aunt and uncle", "multicultural_london.yaml", "aunty and unc"), + ], +) +async def test_multiple_words_custom_colloquialisms(input_text, wordswap_path, expected_output): + converter = ColloquialWordswapConverter(deterministic=True, wordswap_path=wordswap_path) + result = await converter.convert_async(prompt=input_text) + assert result.output_text == expected_output + + # Test for awkward spacing @pytest.mark.asyncio @pytest.mark.parametrize( From 675e6ce4d4911a348951a8e2f1ba156f1bfae9d5 Mon Sep 17 00:00:00 2001 From: Taher Date: Tue, 3 Feb 2026 21:32:06 -0500 Subject: [PATCH 2/9] style: fix missing newlines at the end of yaml files --- .../prompt_converters/colloquial_wordswaps/filipino.yaml | 2 +- .../datasets/prompt_converters/colloquial_wordswaps/indian.yaml | 2 +- .../colloquial_wordswaps/multicultural_london.yaml | 2 +- .../prompt_converters/colloquial_wordswaps/singaporean.yaml | 2 +- .../colloquial_wordswaps/southern_american.yaml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pyrit/datasets/prompt_converters/colloquial_wordswaps/filipino.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/filipino.yaml index 0bcf1dedc8..e7bbcb9ccb 100644 --- a/pyrit/datasets/prompt_converters/colloquial_wordswaps/filipino.yaml +++ b/pyrit/datasets/prompt_converters/colloquial_wordswaps/filipino.yaml @@ -12,4 +12,4 @@ man: ["kuya", "boss", "pare", "lodi", "chong"] woman: ["ate", "miss", "ganda", "marites"] uncle: ["tito", "tiyo", "tito paps"] sister: ["ate", "sis", "sissy"] -brother: ["kuya", "bro", "paps", "tol"] \ No newline at end of file +brother: ["kuya", "bro", "paps", "tol"] diff --git a/pyrit/datasets/prompt_converters/colloquial_wordswaps/indian.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/indian.yaml index 56d5da5c3b..e0798310ac 100644 --- a/pyrit/datasets/prompt_converters/colloquial_wordswaps/indian.yaml +++ b/pyrit/datasets/prompt_converters/colloquial_wordswaps/indian.yaml @@ -12,4 +12,4 @@ man: ["bhai", "bhaiyya", "uncle", "banda", "yaar"] woman: ["behenji", "didiji", "bandi", "memsaab"] uncle: ["uncle", "chacha", "mama", "fufa", "tauji"] sister: ["didi", "behen", "behna", "sis"] -brother: ["bhai", "bhaiyya", "bro", "veer", "paji"] \ No newline at end of file +brother: ["bhai", "bhaiyya", "bro", "veer", "paji"] diff --git a/pyrit/datasets/prompt_converters/colloquial_wordswaps/multicultural_london.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/multicultural_london.yaml index 3b72c0530a..874765856a 100644 --- a/pyrit/datasets/prompt_converters/colloquial_wordswaps/multicultural_london.yaml +++ b/pyrit/datasets/prompt_converters/colloquial_wordswaps/multicultural_london.yaml @@ -12,4 +12,4 @@ man: ["mandem", "man", "geezer", "bloke"] woman: ["gyal", "lady", "madam"] uncle: ["unc", "uncle"] sister: ["sis", "sissy"] -brother: ["bruv", "bredrin", "fam", "blood", "brudda"] \ No newline at end of file +brother: ["bruv", "bredrin", "fam", "blood", "brudda"] diff --git a/pyrit/datasets/prompt_converters/colloquial_wordswaps/singaporean.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/singaporean.yaml index 1e87da2979..f64e94a972 100644 --- a/pyrit/datasets/prompt_converters/colloquial_wordswaps/singaporean.yaml +++ b/pyrit/datasets/prompt_converters/colloquial_wordswaps/singaporean.yaml @@ -12,4 +12,4 @@ man: ["ah beng", "shuai ge"] woman: ["ah lian", "xiao mei"] uncle: ["encik", "unker"] sister: ["xjj", "jie jie", "zhezhe", "kaka", "akka", "thangatchi"] -brother: ["bro", "boiboi", "di di", "xdd", "anneh", "thambi"] \ No newline at end of file +brother: ["bro", "boiboi", "di di", "xdd", "anneh", "thambi"] diff --git a/pyrit/datasets/prompt_converters/colloquial_wordswaps/southern_american.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/southern_american.yaml index a5b7ff4511..4dfa65ec30 100644 --- a/pyrit/datasets/prompt_converters/colloquial_wordswaps/southern_american.yaml +++ b/pyrit/datasets/prompt_converters/colloquial_wordswaps/southern_american.yaml @@ -12,4 +12,4 @@ man: ["fella", "hoss", "bubba"] woman: ["ma'am", "lady", "missy"] uncle: ["unk"] sister: ["sissy", "sis"] -brother: ["bubba", "bro"] \ No newline at end of file +brother: ["bubba", "bro"] From a616b7ee16653a24affcf2cc4604f55859ad597c Mon Sep 17 00:00:00 2001 From: Taher Date: Fri, 6 Feb 2026 14:20:45 -0500 Subject: [PATCH 3/9] Fixed function signature and file path --- .../colloquial_wordswap_converter.py | 58 +++++++++++-------- .../test_colloquial_wordswap_converter.py | 18 +++++- 2 files changed, 51 insertions(+), 25 deletions(-) diff --git a/pyrit/prompt_converter/colloquial_wordswap_converter.py b/pyrit/prompt_converter/colloquial_wordswap_converter.py index 94e110c324..972b9134df 100644 --- a/pyrit/prompt_converter/colloquial_wordswap_converter.py +++ b/pyrit/prompt_converter/colloquial_wordswap_converter.py @@ -3,11 +3,11 @@ import random import re -from pathlib import Path -from typing import Optional +from typing import Dict, List, Optional import yaml +from pyrit.common.path import DATASETS_PATH from pyrit.identifiers import ConverterIdentifier from pyrit.models import PromptDataType from pyrit.prompt_converter.prompt_converter import ConverterResult, PromptConverter @@ -21,44 +21,54 @@ class ColloquialWordswapConverter(PromptConverter): SUPPORTED_INPUT_TYPES = ("text",) SUPPORTED_OUTPUT_TYPES = ("text",) - def __init__(self, deterministic: bool = False, wordswap_path: Optional[str] = None) -> None: + def __init__( + self, + deterministic: bool = False, + *, + custom_substitutions: Optional[Dict[str, List[str]]] = None, + wordswap_path: Optional[str] = None, + ) -> None: """ Initialize the converter with optional deterministic mode and custom substitutions. Args: deterministic (bool): If True, use the first substitution for each wordswap. If False, randomly choose a substitution for each wordswap. Defaults to False. - wordswap_path (Optional[str]): File name of a YAML file in ../../datasets/prompt_converters/colloquial_wordswaps - directory containing a dictionary of substitutions. Defaults to None. + custom_substitutions (Optional[Dict[str, List[str]]], Optional): A dictionary of + custom substitutions to override the defaults. Defaults to none. + wordswap_path (Optional[str]): Name of a YAML file located in the + PyRIT datasets prompt_converters/colloquial_wordswaps directory. Raises: FileNotFoundError: If the wordswap YAML file is not found. - ValueError: If the YAML file is formatted incorrectly or empty. + ValueError: If both parameters are provided or YAML format is invalid. """ - # Use custom substitutions if wordswap_path provided, otherwise default to singaporean.yaml - if wordswap_path: - file_path = ( - Path(__file__).parent.parent / "datasets" / "prompt_converters" / "colloquial_wordswaps" / wordswap_path - ) + if custom_substitutions is not None and wordswap_path is not None: + raise ValueError("Provide either custom_substitutions or wordswap_path, not both.") + + wordswap_directory = DATASETS_PATH / "prompt_converters" / "colloquial_wordswaps" + + # custom_substitution arg prioritization + if custom_substitutions is not None: + self._colloquial_substitutions = custom_substitutions else: + # if neither custom_sub nor wordswap_path is given then singaporean substituions are used file_path = ( - Path(__file__).parent.parent - / "datasets" - / "prompt_converters" - / "colloquial_wordswaps" - / "singaporean.yaml" + wordswap_directory / wordswap_path + if wordswap_path is not None + else wordswap_directory / "singaporean.yaml" ) - if not file_path.exists(): - raise FileNotFoundError(f"Colloquial wordswap file not found: {file_path}") + if not file_path.exists(): + raise FileNotFoundError(f"Colloquial wordswap file not found: {file_path}") + with file_path.open("r", encoding="utf-8") as f: + data = yaml.safe_load(f) + # ensure that wordswap YAML is in the correct format. + if not isinstance(data, dict): + raise ValueError("Wordswap YAML must be a dict[str, list[str]] mapping words to substitutions") - with file_path.open("r", encoding="utf-8") as f: - data = yaml.safe_load(f) - # Ensure that wordswap YAML is in the correct format. - if not isinstance(data, dict): - raise ValueError("Wordswap YAML must contain a dictionary of word -> list of substitutions") + self._colloquial_substitutions = data - self._colloquial_substitutions = data self._deterministic = deterministic def _build_identifier(self) -> ConverterIdentifier: diff --git a/tests/unit/converter/test_colloquial_wordswap_converter.py b/tests/unit/converter/test_colloquial_wordswap_converter.py index 520a173dca..dbc9ed5709 100644 --- a/tests/unit/converter/test_colloquial_wordswap_converter.py +++ b/tests/unit/converter/test_colloquial_wordswap_converter.py @@ -87,7 +87,7 @@ async def test_colloquial_custom_substitutions(input_text, wordswap_path, expect ], ) async def test_colloquial_empty_custom_substitutions(input_text, expected_output): - converter = ColloquialWordswapConverter(deterministic=True, wordswap_path="") + converter = ColloquialWordswapConverter(deterministic=True, wordswap_path=None) result = await converter.convert_async(prompt=input_text) assert result.output_text == expected_output @@ -158,3 +158,19 @@ def test_colloquial_converter_input_supported() -> None: converter = ColloquialWordswapConverter() assert converter.input_supported("text") is True assert converter.input_supported("image_path") is False + + +# Test that the constructor raises a ValueError when both custom_substitutions and wordswap_path are provided. +def test_init_conflict_custom_substitutions_and_path(): + with pytest.raises(ValueError, match="Provide either custom_substitutions or wordswap_path"): + ColloquialWordswapConverter(custom_substitutions={"foo": ["bar"]}, wordswap_path="some_file.yaml") + + +# test to check if direct dictionary of substitutions is passed and applies to prompt conversion correctly +@pytest.mark.asyncio +async def test_init_with_custom_substitutions_dict(): + custom_subs = {"hello": ["hi", "hey"], "world": ["earth"]} + # Use deterministic=True to ensure it picks the first item ("hi") for assertion + converter = ColloquialWordswapConverter(deterministic=True, custom_substitutions=custom_subs) + result = await converter.convert_async(prompt="Hello world") + assert result.output_text == "hi earth" From 4190739010c13dc9c487fb7f29b72c96a20ebc5c Mon Sep 17 00:00:00 2001 From: Roman Lutz Date: Wed, 25 Feb 2026 11:41:32 -0800 Subject: [PATCH 4/9] FEAT: Generalize ColloquialWordswapConverter to support regional YAML files - Add wordswap_path parameter to load substitutions from YAML files - Keep custom_substitutions for backward compatibility - Add YAML files: singaporean, filipino, indian, multicultural_london, southern_american - Move hardcoded substitutions to singaporean.yaml (remains the default) - Use CONVERTER_SEED_PROMPT_PATH for consistent path handling - Make all __init__ params keyword-only for consistency with other converters - Add input validation: YAML format, value types, mutual exclusion of params - Catch yaml.YAMLError and re-raise as ValueError with file context - Include wordswap_path in component identifier for proper tracking - Add tests for YAML paths, error paths, and custom dict substitutions Closes #418 Co-authored-by: taherakolawala <181370151+taherakolawala@users.noreply.github.com> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../multicultural_london.yaml | 2 +- .../colloquial_wordswap_converter.py | 65 ++++++++----- .../test_colloquial_wordswap_converter.py | 95 ++++++++++++------- 3 files changed, 106 insertions(+), 56 deletions(-) diff --git a/pyrit/datasets/prompt_converters/colloquial_wordswaps/multicultural_london.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/multicultural_london.yaml index 874765856a..fad7b2f437 100644 --- a/pyrit/datasets/prompt_converters/colloquial_wordswaps/multicultural_london.yaml +++ b/pyrit/datasets/prompt_converters/colloquial_wordswaps/multicultural_london.yaml @@ -2,7 +2,7 @@ father: ["pops", "old man", "dadman", "pa"] mother: ["mumsy", "moms", "mummy", "ma"] grandfather: ["grandad", "gramps"] grandmother: ["nan", "nanna", "gran"] -girl: ["gyal", "ting", "shorty", " peng ting"] +girl: ["gyal", "ting", "shorty", "peng ting"] boy: ["yute", "man", "lil man"] son: ["yute", "junior", "son-son"] daughter: ["princess", "baby girl"] diff --git a/pyrit/prompt_converter/colloquial_wordswap_converter.py b/pyrit/prompt_converter/colloquial_wordswap_converter.py index fcd456bc56..6d9555a7ee 100644 --- a/pyrit/prompt_converter/colloquial_wordswap_converter.py +++ b/pyrit/prompt_converter/colloquial_wordswap_converter.py @@ -1,13 +1,14 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. +import pathlib import random import re from typing import Optional import yaml -from pyrit.common.path import DATASETS_PATH +from pyrit.common.path import CONVERTER_SEED_PROMPT_PATH from pyrit.identifiers import ComponentIdentifier from pyrit.models import PromptDataType from pyrit.prompt_converter.prompt_converter import ConverterResult, PromptConverter @@ -15,7 +16,10 @@ class ColloquialWordswapConverter(PromptConverter): """ - Converts text into colloquial Singaporean context. + Converts text by replacing words with regional colloquial alternatives. + + Supports loading substitutions from YAML files (e.g., Singaporean, Filipino, Indian) + or accepting a custom substitution dictionary. Defaults to Singaporean substitutions. """ SUPPORTED_INPUT_TYPES = ("text",) @@ -23,49 +27,63 @@ class ColloquialWordswapConverter(PromptConverter): def __init__( self, - deterministic: bool = False, *, + deterministic: bool = False, custom_substitutions: Optional[dict[str, list[str]]] = None, wordswap_path: Optional[str] = None, ) -> None: """ - Initialize the converter with optional deterministic mode and custom substitutions. + Initialize the converter with optional deterministic mode and substitutions source. Args: deterministic (bool): If True, use the first substitution for each wordswap. If False, randomly choose a substitution for each wordswap. Defaults to False. - custom_substitutions (Optional[Dict[str, List[str]]], Optional): A dictionary of - custom substitutions to override the defaults. Defaults to none. - wordswap_path (Optional[str]): Name of a YAML file located in the - PyRIT datasets prompt_converters/colloquial_wordswaps directory. + custom_substitutions (Optional[dict[str, list[str]]]): A dictionary of custom substitutions + to override the defaults. Defaults to None. + wordswap_path (Optional[str]): Path to a YAML file containing word substitutions. + Can be a filename within the built-in colloquial_wordswaps directory (e.g., "filipino.yaml") + or an absolute path to a custom YAML file. Defaults to None (uses singaporean.yaml). Raises: - FileNotFoundError: If the wordswap YAML file is not found. - ValueError: If both parameters are provided or YAML format is invalid. + ValueError: If both custom_substitutions and wordswap_path are provided, + or if the YAML file has an invalid format. + FileNotFoundError: If the specified wordswap YAML file does not exist. """ if custom_substitutions is not None and wordswap_path is not None: raise ValueError("Provide either custom_substitutions or wordswap_path, not both.") - wordswap_directory = DATASETS_PATH / "prompt_converters" / "colloquial_wordswaps" + self._wordswap_path = wordswap_path - # custom_substitution arg prioritization if custom_substitutions is not None: self._colloquial_substitutions = custom_substitutions else: - # if neither custom_sub nor wordswap_path is given then singaporean substituions are used - file_path = ( - wordswap_directory / wordswap_path - if wordswap_path is not None - else wordswap_directory / "singaporean.yaml" - ) + wordswap_directory = CONVERTER_SEED_PROMPT_PATH / "colloquial_wordswaps" + + if wordswap_path is not None: + file_path = pathlib.Path(wordswap_path) + if not file_path.is_absolute(): + file_path = wordswap_directory / wordswap_path + else: + file_path = wordswap_directory / "singaporean.yaml" if not file_path.exists(): raise FileNotFoundError(f"Colloquial wordswap file not found: {file_path}") - with file_path.open("r", encoding="utf-8") as f: - data = yaml.safe_load(f) - # ensure that wordswap YAML is in the correct format. + + try: + with file_path.open("r", encoding="utf-8") as f: + data = yaml.safe_load(f) + except yaml.YAMLError as exc: + raise ValueError(f"Invalid YAML format in wordswap file: {file_path}") from exc + if not isinstance(data, dict): - raise ValueError("Wordswap YAML must be a dict[str, list[str]] mapping words to substitutions") + raise ValueError("Wordswap YAML must be a dict[str, list[str]] mapping words to substitutions.") + + for key, value in data.items(): + if not isinstance(key, str) or not isinstance(value, list) or len(value) == 0: + raise ValueError( + f"Invalid entry in wordswap YAML: key={key!r}. " + "Each key must be a string and each value a non-empty list of strings." + ) self._colloquial_substitutions = data @@ -81,13 +99,14 @@ def _build_identifier(self) -> ComponentIdentifier: return self._create_identifier( params={ "deterministic": self._deterministic, + "wordswap_path": self._wordswap_path, "substitution_keys": sorted(self._colloquial_substitutions.keys()), } ) async def convert_async(self, *, prompt: str, input_type: PromptDataType = "text") -> ConverterResult: """ - Convert the given prompt by replacing words with colloquial Singaporean terms. + Convert the given prompt by replacing words with regional colloquial alternatives. Args: prompt (str): The input text prompt to be converted. diff --git a/tests/unit/converter/test_colloquial_wordswap_converter.py b/tests/unit/converter/test_colloquial_wordswap_converter.py index dbc9ed5709..d458421ee3 100644 --- a/tests/unit/converter/test_colloquial_wordswap_converter.py +++ b/tests/unit/converter/test_colloquial_wordswap_converter.py @@ -61,7 +61,22 @@ async def test_colloquial_non_deterministic(input_text): assert output_word == input_word -# Test for nondefault substitutions. +# Test for custom substitutions dict +@pytest.mark.asyncio +@pytest.mark.parametrize( + "input_text,custom_substitutions,expected_output", + [ + ("father", {"father": ["appa", "darth vader"]}, "appa"), + ("Hello world", {"hello": ["hi", "hey"], "world": ["earth"]}, "hi earth"), + ], +) +async def test_colloquial_custom_substitutions(input_text, custom_substitutions, expected_output): + converter = ColloquialWordswapConverter(deterministic=True, custom_substitutions=custom_substitutions) + result = await converter.convert_async(prompt=input_text) + assert result.output_text == expected_output + + +# Test for non-default wordswap YAML paths @pytest.mark.asyncio @pytest.mark.parametrize( "input_text,wordswap_path,expected_output", @@ -72,26 +87,36 @@ async def test_colloquial_non_deterministic(input_text): ("man", "multicultural_london.yaml", "mandem"), ], ) -async def test_colloquial_custom_substitutions(input_text, wordswap_path, expected_output): +async def test_colloquial_wordswap_path(input_text, wordswap_path, expected_output): converter = ColloquialWordswapConverter(deterministic=True, wordswap_path=wordswap_path) result = await converter.convert_async(prompt=input_text) assert result.output_text == expected_output -# Test for empty wordswap_path +# Test multiple word prompts for non-default wordswap paths @pytest.mark.asyncio @pytest.mark.parametrize( - "input_text,expected_output", + "input_text,wordswap_path,expected_output", [ - ("mother and father", "mama and papa"), # Using default substitutions when custom is empty + ("father and mother", "indian.yaml", "papa and mummy"), + ("brother and sister", "southern_american.yaml", "bubba and sissy"), + ("aunt and uncle", "multicultural_london.yaml", "aunty and unc"), ], ) -async def test_colloquial_empty_custom_substitutions(input_text, expected_output): - converter = ColloquialWordswapConverter(deterministic=True, wordswap_path=None) +async def test_multiple_words_custom_colloquialisms(input_text, wordswap_path, expected_output): + converter = ColloquialWordswapConverter(deterministic=True, wordswap_path=wordswap_path) result = await converter.convert_async(prompt=input_text) assert result.output_text == expected_output +# Test default behavior when no wordswap_path or custom_substitutions given +@pytest.mark.asyncio +async def test_colloquial_default_singaporean(): + converter = ColloquialWordswapConverter(deterministic=True) + result = await converter.convert_async(prompt="mother and father") + assert result.output_text == "mama and papa" + + # Test multiple word prompts @pytest.mark.asyncio @pytest.mark.parametrize( @@ -108,22 +133,6 @@ async def test_multiple_words(input_text, expected_output): assert result.output_text == expected_output -# Test multiple word prompts for custom colloquialism -@pytest.mark.asyncio -@pytest.mark.parametrize( - "input_text,wordswap_path,expected_output", - [ - ("father and mother", "indian.yaml", "papa and mummy"), - ("brother and sister", "southern_american.yaml", "bubba and sissy"), - ("aunt and uncle", "multicultural_london.yaml", "aunty and unc"), - ], -) -async def test_multiple_words_custom_colloquialisms(input_text, wordswap_path, expected_output): - converter = ColloquialWordswapConverter(deterministic=True, wordswap_path=wordswap_path) - result = await converter.convert_async(prompt=input_text) - assert result.output_text == expected_output - - # Test for awkward spacing @pytest.mark.asyncio @pytest.mark.parametrize( @@ -160,17 +169,39 @@ def test_colloquial_converter_input_supported() -> None: assert converter.input_supported("image_path") is False -# Test that the constructor raises a ValueError when both custom_substitutions and wordswap_path are provided. def test_init_conflict_custom_substitutions_and_path(): with pytest.raises(ValueError, match="Provide either custom_substitutions or wordswap_path"): ColloquialWordswapConverter(custom_substitutions={"foo": ["bar"]}, wordswap_path="some_file.yaml") -# test to check if direct dictionary of substitutions is passed and applies to prompt conversion correctly -@pytest.mark.asyncio -async def test_init_with_custom_substitutions_dict(): - custom_subs = {"hello": ["hi", "hey"], "world": ["earth"]} - # Use deterministic=True to ensure it picks the first item ("hi") for assertion - converter = ColloquialWordswapConverter(deterministic=True, custom_substitutions=custom_subs) - result = await converter.convert_async(prompt="Hello world") - assert result.output_text == "hi earth" +def test_init_missing_yaml_file(): + with pytest.raises(FileNotFoundError, match="Colloquial wordswap file not found"): + ColloquialWordswapConverter(wordswap_path="nonexistent.yaml") + + +def test_init_invalid_yaml_format(tmp_path): + bad_yaml = tmp_path / "bad.yaml" + bad_yaml.write_text("[not, a, dict]", encoding="utf-8") + with pytest.raises(ValueError, match="must be a dict"): + ColloquialWordswapConverter(wordswap_path=str(bad_yaml)) + + +def test_init_invalid_yaml_values(tmp_path): + bad_values = tmp_path / "bad_values.yaml" + bad_values.write_text('father: "not a list"', encoding="utf-8") + with pytest.raises(ValueError, match="Invalid entry in wordswap YAML"): + ColloquialWordswapConverter(wordswap_path=str(bad_values)) + + +def test_init_invalid_yaml_empty_list(tmp_path): + empty_list = tmp_path / "empty_list.yaml" + empty_list.write_text("father: []", encoding="utf-8") + with pytest.raises(ValueError, match="Invalid entry in wordswap YAML"): + ColloquialWordswapConverter(wordswap_path=str(empty_list)) + + +def test_init_malformed_yaml(tmp_path): + malformed = tmp_path / "malformed.yaml" + malformed.write_text("{{invalid yaml::", encoding="utf-8") + with pytest.raises(ValueError, match="Invalid YAML format"): + ColloquialWordswapConverter(wordswap_path=str(malformed)) From 6317509c9d2231ecda85044520b5d668c5b4f847 Mon Sep 17 00:00:00 2001 From: Roman Lutz Date: Thu, 26 Feb 2026 05:04:10 -0800 Subject: [PATCH 5/9] FIX: Preserve backward compatibility for deterministic and empty custom_substitutions - Keep deterministic as positional arg (reverting keyword-only change) - Empty custom_substitutions={} falls through to defaults as before - Add test for empty custom_substitutions behavior Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- pyrit/prompt_converter/colloquial_wordswap_converter.py | 4 ++-- .../unit/converter/test_colloquial_wordswap_converter.py | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pyrit/prompt_converter/colloquial_wordswap_converter.py b/pyrit/prompt_converter/colloquial_wordswap_converter.py index 6d9555a7ee..b96ed254ba 100644 --- a/pyrit/prompt_converter/colloquial_wordswap_converter.py +++ b/pyrit/prompt_converter/colloquial_wordswap_converter.py @@ -27,8 +27,8 @@ class ColloquialWordswapConverter(PromptConverter): def __init__( self, - *, deterministic: bool = False, + *, custom_substitutions: Optional[dict[str, list[str]]] = None, wordswap_path: Optional[str] = None, ) -> None: @@ -54,7 +54,7 @@ def __init__( self._wordswap_path = wordswap_path - if custom_substitutions is not None: + if custom_substitutions is not None and len(custom_substitutions) > 0: self._colloquial_substitutions = custom_substitutions else: wordswap_directory = CONVERTER_SEED_PROMPT_PATH / "colloquial_wordswaps" diff --git a/tests/unit/converter/test_colloquial_wordswap_converter.py b/tests/unit/converter/test_colloquial_wordswap_converter.py index d458421ee3..06684f0c71 100644 --- a/tests/unit/converter/test_colloquial_wordswap_converter.py +++ b/tests/unit/converter/test_colloquial_wordswap_converter.py @@ -117,6 +117,14 @@ async def test_colloquial_default_singaporean(): assert result.output_text == "mama and papa" +# Test that empty custom_substitutions falls through to defaults +@pytest.mark.asyncio +async def test_colloquial_empty_custom_substitutions(): + converter = ColloquialWordswapConverter(deterministic=True, custom_substitutions={}) + result = await converter.convert_async(prompt="mother and father") + assert result.output_text == "mama and papa" + + # Test multiple word prompts @pytest.mark.asyncio @pytest.mark.parametrize( From 0361db6710c6b121e4a5bbf4546acfa60d08cf53 Mon Sep 17 00:00:00 2001 From: Roman Lutz Date: Thu, 26 Feb 2026 05:08:23 -0800 Subject: [PATCH 6/9] MAINT: Deprecate positional 'deterministic' arg, keyword-only in 0.13.0 - Positional usage emits FutureWarning with migration guidance - Keyword usage works without warning - Add test for deprecation warning Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../colloquial_wordswap_converter.py | 12 +++++++++++- .../converter/test_colloquial_wordswap_converter.py | 5 +++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/pyrit/prompt_converter/colloquial_wordswap_converter.py b/pyrit/prompt_converter/colloquial_wordswap_converter.py index b96ed254ba..1f60c31709 100644 --- a/pyrit/prompt_converter/colloquial_wordswap_converter.py +++ b/pyrit/prompt_converter/colloquial_wordswap_converter.py @@ -4,6 +4,7 @@ import pathlib import random import re +import warnings from typing import Optional import yaml @@ -27,8 +28,8 @@ class ColloquialWordswapConverter(PromptConverter): def __init__( self, + *args, deterministic: bool = False, - *, custom_substitutions: Optional[dict[str, list[str]]] = None, wordswap_path: Optional[str] = None, ) -> None: @@ -49,6 +50,15 @@ def __init__( or if the YAML file has an invalid format. FileNotFoundError: If the specified wordswap YAML file does not exist. """ + if args: + warnings.warn( + "Passing 'deterministic' as a positional argument is deprecated. " + "Use deterministic=... as a keyword argument. " + "It will be keyword-only starting in version 0.13.0.", + FutureWarning, + stacklevel=2, + ) + deterministic = args[0] if custom_substitutions is not None and wordswap_path is not None: raise ValueError("Provide either custom_substitutions or wordswap_path, not both.") diff --git a/tests/unit/converter/test_colloquial_wordswap_converter.py b/tests/unit/converter/test_colloquial_wordswap_converter.py index 06684f0c71..8ae151eb6c 100644 --- a/tests/unit/converter/test_colloquial_wordswap_converter.py +++ b/tests/unit/converter/test_colloquial_wordswap_converter.py @@ -213,3 +213,8 @@ def test_init_malformed_yaml(tmp_path): malformed.write_text("{{invalid yaml::", encoding="utf-8") with pytest.raises(ValueError, match="Invalid YAML format"): ColloquialWordswapConverter(wordswap_path=str(malformed)) + + +def test_deterministic_positional_deprecation_warning(): + with pytest.warns(FutureWarning, match="positional argument is deprecated"): + ColloquialWordswapConverter(True) From 006f37d8948b78c477daa00f56b29be017b351d3 Mon Sep 17 00:00:00 2001 From: Roman Lutz Date: Thu, 26 Feb 2026 05:10:00 -0800 Subject: [PATCH 7/9] TEST: Add keyword-only test for deterministic deprecation Verify FutureWarning is only emitted for positional usage, not keyword. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- tests/unit/converter/test_colloquial_wordswap_converter.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/unit/converter/test_colloquial_wordswap_converter.py b/tests/unit/converter/test_colloquial_wordswap_converter.py index 8ae151eb6c..cb2eb6c8f6 100644 --- a/tests/unit/converter/test_colloquial_wordswap_converter.py +++ b/tests/unit/converter/test_colloquial_wordswap_converter.py @@ -2,6 +2,7 @@ # Licensed under the MIT license. import re +import warnings import pytest @@ -218,3 +219,9 @@ def test_init_malformed_yaml(tmp_path): def test_deterministic_positional_deprecation_warning(): with pytest.warns(FutureWarning, match="positional argument is deprecated"): ColloquialWordswapConverter(True) + + +def test_deterministic_keyword_no_deprecation_warning(): + with warnings.catch_warnings(): + warnings.simplefilter("error", FutureWarning) + ColloquialWordswapConverter(deterministic=True) From f3489db2887e36a5600444bc25a752c20ed7a4c3 Mon Sep 17 00:00:00 2001 From: Roman Lutz Date: Thu, 26 Feb 2026 05:42:51 -0800 Subject: [PATCH 8/9] FIX: Add type annotation to *args for mypy Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- pyrit/prompt_converter/colloquial_wordswap_converter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyrit/prompt_converter/colloquial_wordswap_converter.py b/pyrit/prompt_converter/colloquial_wordswap_converter.py index 1f60c31709..6c8461ef4b 100644 --- a/pyrit/prompt_converter/colloquial_wordswap_converter.py +++ b/pyrit/prompt_converter/colloquial_wordswap_converter.py @@ -28,7 +28,7 @@ class ColloquialWordswapConverter(PromptConverter): def __init__( self, - *args, + *args: bool, deterministic: bool = False, custom_substitutions: Optional[dict[str, list[str]]] = None, wordswap_path: Optional[str] = None, From 336393fe6e6e9ad775a1de2e92ae9edbdf8d1aa0 Mon Sep 17 00:00:00 2001 From: Roman Lutz Date: Thu, 26 Feb 2026 06:05:54 -0800 Subject: [PATCH 9/9] FIX: Add *args docstring for ruff D417 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- pyrit/prompt_converter/colloquial_wordswap_converter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pyrit/prompt_converter/colloquial_wordswap_converter.py b/pyrit/prompt_converter/colloquial_wordswap_converter.py index 6c8461ef4b..78a2f69228 100644 --- a/pyrit/prompt_converter/colloquial_wordswap_converter.py +++ b/pyrit/prompt_converter/colloquial_wordswap_converter.py @@ -37,6 +37,7 @@ def __init__( Initialize the converter with optional deterministic mode and substitutions source. Args: + *args: Deprecated positional argument for deterministic. Use deterministic=... instead. deterministic (bool): If True, use the first substitution for each wordswap. If False, randomly choose a substitution for each wordswap. Defaults to False. custom_substitutions (Optional[dict[str, list[str]]]): A dictionary of custom substitutions