diff --git a/pyrit/datasets/prompt_converters/colloquial_wordswaps/filipino.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/filipino.yaml new file mode 100644 index 0000000000..e7bbcb9ccb --- /dev/null +++ b/pyrit/datasets/prompt_converters/colloquial_wordswaps/filipino.yaml @@ -0,0 +1,15 @@ +father: ["papa", "itay", "tay", "dad", "erpat"] +mother: ["mama", "inay", "nay", "mom", "ermat"] +grandfather: ["lolo", "lo", "lolo paps"] +grandmother: ["lola", "la", "lola mams"] +girl: ["nene", "inday", "ganda"] +boy: ["totoy", "dodong", "pogi"] +son: ["anak", "junior", "boy"] +daughter: ["anak", "nene", "ga"] +aunty: ["tita", "tiya", "tante"] +aunt: ["tita", "tiyang"] +man: ["kuya", "boss", "pare", "lodi", "chong"] +woman: ["ate", "miss", "ganda", "marites"] +uncle: ["tito", "tiyo", "tito paps"] +sister: ["ate", "sis", "sissy"] +brother: ["kuya", "bro", "paps", "tol"] diff --git a/pyrit/datasets/prompt_converters/colloquial_wordswaps/indian.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/indian.yaml new file mode 100644 index 0000000000..e0798310ac --- /dev/null +++ b/pyrit/datasets/prompt_converters/colloquial_wordswaps/indian.yaml @@ -0,0 +1,15 @@ +father: ["papa", "bauji", "abba", "pitaji", "dad", "pop"] +mother: ["mummy", "maa", "ammi", "mataji", "amman"] +grandfather: ["dada", "nana", "dadaji", "nanaji"] +grandmother: ["dadi", "nani", "dadiji", "naniji"] +girl: ["beti", "kudi", "larki", "gudiya"] +boy: ["beta", "munda", "larka", "chokra"] +son: ["beta", "ladla", "puttar"] +daughter: ["beti", "bitiya", "laado"] +aunty: ["aunty", "mausi", "bua", "chachi", "mami"] +aunt: ["aunty", "mausi", "bua", "chachi", "mami"] +man: ["bhai", "bhaiyya", "uncle", "banda", "yaar"] +woman: ["behenji", "didiji", "bandi", "memsaab"] +uncle: ["uncle", "chacha", "mama", "fufa", "tauji"] +sister: ["didi", "behen", "behna", "sis"] +brother: ["bhai", "bhaiyya", "bro", "veer", "paji"] diff --git 
a/pyrit/datasets/prompt_converters/colloquial_wordswaps/multicultural_london.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/multicultural_london.yaml new file mode 100644 index 0000000000..fad7b2f437 --- /dev/null +++ b/pyrit/datasets/prompt_converters/colloquial_wordswaps/multicultural_london.yaml @@ -0,0 +1,15 @@ +father: ["pops", "old man", "dadman", "pa"] +mother: ["mumsy", "moms", "mummy", "ma"] +grandfather: ["grandad", "gramps"] +grandmother: ["nan", "nanna", "gran"] +girl: ["gyal", "ting", "shorty", "peng ting"] +boy: ["yute", "man", "lil man"] +son: ["yute", "junior", "son-son"] +daughter: ["princess", "baby girl"] +aunty: ["aunty", "tantie"] +aunt: ["aunty", "aunt"] +man: ["mandem", "man", "geezer", "bloke"] +woman: ["gyal", "lady", "madam"] +uncle: ["unc", "uncle"] +sister: ["sis", "sissy"] +brother: ["bruv", "bredrin", "fam", "blood", "brudda"] diff --git a/pyrit/datasets/prompt_converters/colloquial_wordswaps/singaporean.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/singaporean.yaml new file mode 100644 index 0000000000..f64e94a972 --- /dev/null +++ b/pyrit/datasets/prompt_converters/colloquial_wordswaps/singaporean.yaml @@ -0,0 +1,15 @@ +father: ["papa", "lao bei", "lim pei", "bapa", "appa"] +mother: ["mama", "amma", "ibu"] +grandfather: ["ah gong", "thatha", "dato"] +grandmother: ["ah ma", "patti", "nenek"] +girl: ["ah ger", "ponnu"] +boy: ["ah boy", "boi", "payyan"] +son: ["ah boy", "boi", "payyan"] +daughter: ["ah ger", "ponnu"] +aunty: ["makcik", "maami"] +aunt: ["makcik", "maami"] +man: ["ah beng", "shuai ge"] +woman: ["ah lian", "xiao mei"] +uncle: ["encik", "unker"] +sister: ["xjj", "jie jie", "zhezhe", "kaka", "akka", "thangatchi"] +brother: ["bro", "boiboi", "di di", "xdd", "anneh", "thambi"] diff --git a/pyrit/datasets/prompt_converters/colloquial_wordswaps/southern_american.yaml b/pyrit/datasets/prompt_converters/colloquial_wordswaps/southern_american.yaml new file mode 100644 index 0000000000..4dfa65ec30 --- 
/dev/null +++ b/pyrit/datasets/prompt_converters/colloquial_wordswaps/southern_american.yaml @@ -0,0 +1,15 @@ +father: ["daddy", "pa", "pops", "old man"] +mother: ["mama", "momma", "ma"] +grandfather: ["pawpaw", "grandpappy", "gramps"] +grandmother: ["mawmaw", "meemaw", "nana", "grammy"] +girl: ["gal", "missy", "sugar"] +boy: ["bubba", "sonny", "boy"] +son: ["junior", "bub", "son"] +daughter: ["sissy", "honey", "sugar"] +aunty: ["auntie"] +aunt: ["auntie"] +man: ["fella", "hoss", "bubba"] +woman: ["ma'am", "lady", "missy"] +uncle: ["unk"] +sister: ["sissy", "sis"] +brother: ["bubba", "bro"] diff --git a/pyrit/prompt_converter/colloquial_wordswap_converter.py b/pyrit/prompt_converter/colloquial_wordswap_converter.py index ce17fe73a6..78a2f69228 100644 --- a/pyrit/prompt_converter/colloquial_wordswap_converter.py +++ b/pyrit/prompt_converter/colloquial_wordswap_converter.py @@ -1,10 +1,15 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. +import pathlib import random import re +import warnings from typing import Optional +import yaml + +from pyrit.common.path import CONVERTER_SEED_PROMPT_PATH from pyrit.identifiers import ComponentIdentifier from pyrit.models import PromptDataType from pyrit.prompt_converter.prompt_converter import ConverterResult, PromptConverter @@ -12,44 +17,87 @@ class ColloquialWordswapConverter(PromptConverter): """ - Converts text into colloquial Singaporean context. + Converts text by replacing words with regional colloquial alternatives. + + Supports loading substitutions from YAML files (e.g., Singaporean, Filipino, Indian) + or accepting a custom substitution dictionary. Defaults to Singaporean substitutions. 
def __init__(
    self,
    *args: object,
    deterministic: bool = False,
    custom_substitutions: Optional[dict[str, list[str]]] = None,
    wordswap_path: Optional[str] = None,
) -> None:
    """
    Initialize the converter with optional deterministic mode and substitutions source.

    Substitutions come from exactly one place, in this order of precedence:
    a non-empty ``custom_substitutions`` dict, the YAML file named by ``wordswap_path``,
    or the built-in ``singaporean.yaml`` file.

    Args:
        *args: Deprecated positional form of ``deterministic`` and, optionally,
            ``custom_substitutions`` (the order these parameters had when they could still be
            passed positionally). Use keyword arguments instead.
        deterministic (bool): If True, use the first substitution for each wordswap.
            If False, randomly choose a substitution for each wordswap. Defaults to False.
        custom_substitutions (Optional[dict[str, list[str]]]): A dictionary of custom substitutions
            to override the defaults. An empty dictionary is treated like None. Defaults to None.
        wordswap_path (Optional[str]): Path to a YAML file containing word substitutions.
            Can be a filename within the built-in colloquial_wordswaps directory (e.g., "filipino.yaml")
            or an absolute path to a custom YAML file. Defaults to None (uses singaporean.yaml).

    Raises:
        TypeError: If more than two positional arguments are given, if ``custom_substitutions``
            is supplied both positionally and by keyword, or if the positional
            ``custom_substitutions`` is neither a dict nor None.
        ValueError: If both custom_substitutions and wordswap_path are provided,
            or if the YAML file has an invalid format.
        FileNotFoundError: If the specified wordswap YAML file does not exist.
    """
    if args:
        # Reject surplus positionals loudly instead of silently discarding them.
        if len(args) > 2:
            raise TypeError(
                f"ColloquialWordswapConverter() takes at most 2 positional arguments ({len(args)} given)"
            )
        warnings.warn(
            "Passing 'deterministic' as a positional argument is deprecated. "
            "Use deterministic=... as a keyword argument. "
            "It will be keyword-only starting in version 0.13.0.",
            FutureWarning,
            stacklevel=2,
        )
        deterministic = bool(args[0])

        # Legacy callers could also pass custom_substitutions as the second positional
        # argument; keep honouring it during the deprecation window rather than dropping it.
        if len(args) == 2:
            if custom_substitutions is not None:
                raise TypeError(
                    "ColloquialWordswapConverter() got multiple values for argument 'custom_substitutions'"
                )
            positional_substitutions = args[1]
            if positional_substitutions is not None and not isinstance(positional_substitutions, dict):
                raise TypeError("custom_substitutions must be a dict[str, list[str]] or None.")
            custom_substitutions = positional_substitutions

    if custom_substitutions is not None and wordswap_path is not None:
        raise ValueError("Provide either custom_substitutions or wordswap_path, not both.")

    self._wordswap_path = wordswap_path
    self._deterministic = deterministic

    if custom_substitutions:
        self._colloquial_substitutions = custom_substitutions
        return

    wordswap_directory = CONVERTER_SEED_PROMPT_PATH / "colloquial_wordswaps"
    if wordswap_path is None:
        file_path = wordswap_directory / "singaporean.yaml"
    else:
        file_path = pathlib.Path(wordswap_path)
        if not file_path.is_absolute():
            # Relative names are resolved against the bundled colloquial_wordswaps directory.
            file_path = wordswap_directory / wordswap_path

    # is_file() (not exists()) so that a directory is reported here with a clear message
    # instead of failing later inside open() with an unrelated error.
    if not file_path.is_file():
        raise FileNotFoundError(f"Colloquial wordswap file not found: {file_path}")

    try:
        with file_path.open("r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
    except yaml.YAMLError as exc:
        raise ValueError(f"Invalid YAML format in wordswap file: {file_path}") from exc

    if not isinstance(data, dict):
        raise ValueError("Wordswap YAML must be a dict[str, list[str]] mapping words to substitutions.")

    for key, value in data.items():
        # Elements are checked too: YAML scalars such as unquoted numbers do not load as str,
        # and a non-string substitution would only surface later, at conversion time.
        entry_is_valid = (
            isinstance(key, str)
            and isinstance(value, list)
            and len(value) > 0
            and all(isinstance(item, str) for item in value)
        )
        if not entry_is_valid:
            raise ValueError(
                f"Invalid entry in wordswap YAML: key={key!r}. "
                "Each key must be a string and each value a non-empty list of strings."
            )

    self._colloquial_substitutions = data
async def _swap_deterministically(prompt, **converter_kwargs):
    """Build a deterministic converter from the given keyword arguments and return the converted text."""
    swapper = ColloquialWordswapConverter(deterministic=True, **converter_kwargs)
    outcome = await swapper.convert_async(prompt=prompt)
    return outcome.output_text


def _write_yaml(directory, filename, content):
    """Write ``content`` to ``directory / filename`` as UTF-8 and return the path as a string."""
    target = directory / filename
    target.write_text(content, encoding="utf-8")
    return str(target)


# Single-word prompts against each bundled non-default wordswap file
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "input_text,wordswap_path,expected_output",
    [
        ("father", "filipino.yaml", "papa"),
        ("woman", "indian.yaml", "behenji"),
        ("son", "southern_american.yaml", "junior"),
        ("man", "multicultural_london.yaml", "mandem"),
    ],
)
async def test_colloquial_wordswap_path(input_text, wordswap_path, expected_output):
    """A built-in YAML filename selects that file's substitutions."""
    converted = await _swap_deterministically(input_text, wordswap_path=wordswap_path)
    assert converted == expected_output


# Multi-word prompts against non-default wordswap files
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "input_text,wordswap_path,expected_output",
    [
        ("father and mother", "indian.yaml", "papa and mummy"),
        ("brother and sister", "southern_american.yaml", "bubba and sissy"),
        ("aunt and uncle", "multicultural_london.yaml", "aunty and unc"),
    ],
)
async def test_multiple_words_custom_colloquialisms(input_text, wordswap_path, expected_output):
    """Every matching word in a prompt is swapped; other words are kept."""
    converted = await _swap_deterministically(input_text, wordswap_path=wordswap_path)
    assert converted == expected_output


# No wordswap_path and no custom_substitutions: Singaporean defaults apply
@pytest.mark.asyncio
async def test_colloquial_default_singaporean():
    """With no substitution source given, the Singaporean file is used."""
    converted = await _swap_deterministically("mother and father")
    assert converted == "mama and papa"


# An empty custom_substitutions dict falls through to the defaults
@pytest.mark.asyncio
async def test_colloquial_empty_custom_substitutions():
    """An empty custom dict behaves the same as passing no custom dict."""
    converted = await _swap_deterministically("mother and father", custom_substitutions={})
    assert converted == "mama and papa"


def test_init_conflict_custom_substitutions_and_path():
    """Supplying both substitution sources is rejected."""
    with pytest.raises(ValueError, match="Provide either custom_substitutions or wordswap_path"):
        ColloquialWordswapConverter(wordswap_path="some_file.yaml", custom_substitutions={"foo": ["bar"]})


def test_init_missing_yaml_file():
    """A filename that does not exist in the built-in directory raises FileNotFoundError."""
    with pytest.raises(FileNotFoundError, match="Colloquial wordswap file not found"):
        ColloquialWordswapConverter(wordswap_path="nonexistent.yaml")


def test_init_invalid_yaml_format(tmp_path):
    """A YAML document whose top level is a list, not a mapping, is rejected."""
    yaml_file = _write_yaml(tmp_path, "bad.yaml", "[not, a, dict]")
    with pytest.raises(ValueError, match="must be a dict"):
        ColloquialWordswapConverter(wordswap_path=yaml_file)


def test_init_invalid_yaml_values(tmp_path):
    """A mapping value that is a plain string instead of a list is rejected."""
    yaml_file = _write_yaml(tmp_path, "bad_values.yaml", 'father: "not a list"')
    with pytest.raises(ValueError, match="Invalid entry in wordswap YAML"):
        ColloquialWordswapConverter(wordswap_path=yaml_file)


def test_init_invalid_yaml_empty_list(tmp_path):
    """A mapping value that is an empty list is rejected."""
    yaml_file = _write_yaml(tmp_path, "empty_list.yaml", "father: []")
    with pytest.raises(ValueError, match="Invalid entry in wordswap YAML"):
        ColloquialWordswapConverter(wordswap_path=yaml_file)


def test_init_malformed_yaml(tmp_path):
    """Text that is not parseable YAML surfaces as ValueError."""
    yaml_file = _write_yaml(tmp_path, "malformed.yaml", "{{invalid yaml::")
    with pytest.raises(ValueError, match="Invalid YAML format"):
        ColloquialWordswapConverter(wordswap_path=yaml_file)


def test_deterministic_positional_deprecation_warning():
    """Passing deterministic positionally emits the FutureWarning."""
    with pytest.warns(FutureWarning, match="positional argument is deprecated"):
        ColloquialWordswapConverter(True)


def test_deterministic_keyword_no_deprecation_warning():
    """Passing deterministic by keyword emits no FutureWarning."""
    with warnings.catch_warnings():
        # Escalate FutureWarning to an exception so any emission fails the test.
        warnings.simplefilter("error", FutureWarning)
        ColloquialWordswapConverter(deterministic=True)