Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
father: ["papa", "itay", "tay", "dad", "erpat"]
mother: ["mama", "inay", "nay", "mom", "ermat"]
grandfather: ["lolo", "lo", "lolo paps"]
grandmother: ["lola", "la", "lola mams"]
girl: ["nene", "inday", "ganda"]
boy: ["totoy", "dodong", "pogi"]
son: ["anak", "junior", "boy"]
daughter: ["anak", "nene", "ga"]
aunty: ["tita", "tiya", "tante"]
aunt: ["tita", "tiyang"]
man: ["kuya", "boss", "pare", "lodi", "chong"]
woman: ["ate", "miss", "ganda", "marites"]
uncle: ["tito", "tiyo", "tito paps"]
sister: ["ate", "sis", "sissy"]
brother: ["kuya", "bro", "paps", "tol"]
15 changes: 15 additions & 0 deletions pyrit/datasets/prompt_converters/colloquial_wordswaps/indian.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
father: ["papa", "bauji", "abba", "pitaji", "dad", "pop"]
mother: ["mummy", "maa", "ammi", "mataji", "amman"]
grandfather: ["dada", "nana", "dadaji", "nanaji"]
grandmother: ["dadi", "nani", "dadiji", "naniji"]
girl: ["beti", "kudi", "larki", "gudiya"]
boy: ["beta", "munda", "larka", "chokra"]
son: ["beta", "ladla", "puttar"]
daughter: ["beti", "bitiya", "laado"]
aunty: ["aunty", "mausi", "bua", "chachi", "mami"]
aunt: ["aunty", "mausi", "bua", "chachi", "mami"]
man: ["bhai", "bhaiyya", "uncle", "banda", "yaar"]
woman: ["behenji", "didiji", "bandi", "memsaab"]
uncle: ["uncle", "chacha", "mama", "fufa", "tauji"]
sister: ["didi", "behen", "behna", "sis"]
brother: ["bhai", "bhaiyya", "bro", "veer", "paji"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
father: ["pops", "old man", "dadman", "pa"]
mother: ["mumsy", "moms", "mummy", "ma"]
grandfather: ["grandad", "gramps"]
grandmother: ["nan", "nanna", "gran"]
girl: ["gyal", "ting", "shorty", "peng ting"]
boy: ["yute", "man", "lil man"]
son: ["yute", "junior", "son-son"]
daughter: ["princess", "baby girl"]
aunty: ["aunty", "tantie"]
aunt: ["aunty", "aunt"]
man: ["mandem", "man", "geezer", "bloke"]
woman: ["gyal", "lady", "madam"]
uncle: ["unc", "uncle"]
sister: ["sis", "sissy"]
brother: ["bruv", "bredrin", "fam", "blood", "brudda"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
father: ["papa", "lao bei", "lim pei", "bapa", "appa"]
mother: ["mama", "amma", "ibu"]
grandfather: ["ah gong", "thatha", "dato"]
grandmother: ["ah ma", "patti", "nenek"]
girl: ["ah ger", "ponnu"]
boy: ["ah boy", "boi", "payyan"]
son: ["ah boy", "boi", "payyan"]
daughter: ["ah ger", "ponnu"]
aunty: ["makcik", "maami"]
aunt: ["makcik", "maami"]
man: ["ah beng", "shuai ge"]
woman: ["ah lian", "xiao mei"]
uncle: ["encik", "unker"]
sister: ["xjj", "jie jie", "zhezhe", "kaka", "akka", "thangatchi"]
brother: ["bro", "boiboi", "di di", "xdd", "anneh", "thambi"]
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
father: ["daddy", "pa", "pops", "old man"]
mother: ["mama", "momma", "ma"]
grandfather: ["pawpaw", "grandpappy", "gramps"]
grandmother: ["mawmaw", "meemaw", "nana", "grammy"]
girl: ["gal", "missy", "sugar"]
boy: ["bubba", "sonny", "boy"]
son: ["junior", "bub", "son"]
daughter: ["sissy", "honey", "sugar"]
aunty: ["auntie"]
aunt: ["auntie"]
man: ["fella", "hoss", "bubba"]
woman: ["ma'am", "lady", "missy"]
uncle: ["unk"]
sister: ["sissy", "sis"]
brother: ["bubba", "bro"]
101 changes: 75 additions & 26 deletions pyrit/prompt_converter/colloquial_wordswap_converter.py
Original file line number Diff line number Diff line change
@@ -1,55 +1,103 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import pathlib
import random
import re
import warnings
from typing import Optional

import yaml

from pyrit.common.path import CONVERTER_SEED_PROMPT_PATH
from pyrit.identifiers import ComponentIdentifier
from pyrit.models import PromptDataType
from pyrit.prompt_converter.prompt_converter import ConverterResult, PromptConverter


class ColloquialWordswapConverter(PromptConverter):
"""
Converts text into colloquial Singaporean context.
Converts text by replacing words with regional colloquial alternatives.

Supports loading substitutions from YAML files (e.g., Singaporean, Filipino, Indian)
or accepting a custom substitution dictionary. Defaults to Singaporean substitutions.
"""

SUPPORTED_INPUT_TYPES = ("text",)
SUPPORTED_OUTPUT_TYPES = ("text",)

def __init__(
    self,
    *args: bool,
    deterministic: bool = False,
    custom_substitutions: Optional[dict[str, list[str]]] = None,
    wordswap_path: Optional[str] = None,
) -> None:
    """
    Initialize the converter with optional deterministic mode and substitutions source.

    Substitutions come from exactly one place, in this order of preference:
    a non-empty ``custom_substitutions`` dict, the YAML file named by
    ``wordswap_path``, or the built-in ``singaporean.yaml`` file.

    Args:
        *args: Deprecated positional argument for deterministic. Use deterministic=... instead.
            At most one positional value is accepted.
        deterministic (bool): If True, use the first substitution for each wordswap.
            If False, randomly choose a substitution for each wordswap. Defaults to False.
        custom_substitutions (Optional[dict[str, list[str]]]): A dictionary of custom substitutions
            to override the defaults. An empty dict is treated like None and falls back to the
            YAML-based substitutions. Defaults to None.
        wordswap_path (Optional[str]): Path to a YAML file containing word substitutions.
            Can be a filename within the built-in colloquial_wordswaps directory (e.g., "filipino.yaml")
            or an absolute path to a custom YAML file. Defaults to None (uses singaporean.yaml).

    Raises:
        TypeError: If more than one positional argument is provided.
        ValueError: If both custom_substitutions and wordswap_path are provided,
            or if the YAML file has an invalid format.
        FileNotFoundError: If the specified wordswap YAML file does not exist.
    """
    # Only `deterministic` is still tolerated positionally. Anything beyond that used to be
    # dropped without notice, which made the converter silently fall back to the defaults.
    if len(args) > 1:
        raise TypeError(
            f"__init__() accepts at most 1 positional argument ({len(args)} given); "
            "pass custom_substitutions and wordswap_path as keyword arguments."
        )
    if args:
        warnings.warn(
            "Passing 'deterministic' as a positional argument is deprecated. "
            "Use deterministic=... as a keyword argument. "
            "It will be keyword-only starting in version 0.13.0.",
            FutureWarning,
            stacklevel=2,
        )
        deterministic = args[0]
    if custom_substitutions is not None and wordswap_path is not None:
        raise ValueError("Provide either custom_substitutions or wordswap_path, not both.")

    self._wordswap_path = wordswap_path
    self._deterministic = deterministic

    # A non-empty custom dict wins outright; no file is touched in that case.
    if custom_substitutions:
        self._colloquial_substitutions = custom_substitutions
        return

    wordswap_directory = CONVERTER_SEED_PROMPT_PATH / "colloquial_wordswaps"

    if wordswap_path is None:
        file_path = wordswap_directory / "singaporean.yaml"
    else:
        file_path = pathlib.Path(wordswap_path)
        # Bare filenames / relative paths are resolved against the built-in directory.
        if not file_path.is_absolute():
            file_path = wordswap_directory / wordswap_path

    # is_file() rather than exists(): a directory must not slip through to open().
    if not file_path.is_file():
        raise FileNotFoundError(f"Colloquial wordswap file not found: {file_path}")

    try:
        with file_path.open("r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
    except yaml.YAMLError as exc:
        raise ValueError(f"Invalid YAML format in wordswap file: {file_path}") from exc

    if not isinstance(data, dict):
        raise ValueError("Wordswap YAML must be a dict[str, list[str]] mapping words to substitutions.")

    for key, value in data.items():
        # YAML happily parses unquoted scalars such as `no` or `123` as bool/int, so the
        # individual list items are checked too, not only the container type.
        is_valid_entry = (
            isinstance(key, str)
            and isinstance(value, list)
            and len(value) > 0
            and all(isinstance(item, str) for item in value)
        )
        if not is_valid_entry:
            raise ValueError(
                f"Invalid entry in wordswap YAML: key={key!r}. "
                "Each key must be a string and each value a non-empty list of strings."
            )

    self._colloquial_substitutions = data

def _build_identifier(self) -> ComponentIdentifier:
Expand All @@ -62,13 +110,14 @@ def _build_identifier(self) -> ComponentIdentifier:
return self._create_identifier(
params={
"deterministic": self._deterministic,
"wordswap_path": self._wordswap_path,
"substitution_keys": sorted(self._colloquial_substitutions.keys()),
}
)

async def convert_async(self, *, prompt: str, input_type: PromptDataType = "text") -> ConverterResult:
"""
Convert the given prompt by replacing words with colloquial Singaporean terms.
Convert the given prompt by replacing words with regional colloquial alternatives.

Args:
prompt (str): The input text prompt to be converted.
Expand Down
100 changes: 93 additions & 7 deletions tests/unit/converter/test_colloquial_wordswap_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# Licensed under the MIT license.

import re
import warnings

import pytest

Expand Down Expand Up @@ -61,12 +62,13 @@ async def test_colloquial_non_deterministic(input_text):
assert output_word == input_word


# Test for custom substitutions
# Test for custom substitutions dict
@pytest.mark.asyncio
@pytest.mark.parametrize(
"input_text,custom_substitutions,expected_output",
[
("father", {"father": ["appa", "darth vader"]}, "appa"), # Custom substitution father -> appa
("father", {"father": ["appa", "darth vader"]}, "appa"),
("Hello world", {"hello": ["hi", "hey"], "world": ["earth"]}, "hi earth"),
],
)
async def test_colloquial_custom_substitutions(input_text, custom_substitutions, expected_output):
Expand All @@ -75,20 +77,55 @@ async def test_colloquial_custom_substitutions(input_text, custom_substitutions,
assert result.output_text == expected_output


# Test for empty custom substitutions
# Test for non-default wordswap YAML paths
@pytest.mark.asyncio
@pytest.mark.parametrize(
"input_text,expected_output",
"input_text,wordswap_path,expected_output",
[
("mother and father", "mama and papa"), # Using default substitutions when custom is empty
("father", "filipino.yaml", "papa"),
("woman", "indian.yaml", "behenji"),
("son", "southern_american.yaml", "junior"),
("man", "multicultural_london.yaml", "mandem"),
],
)
async def test_colloquial_empty_custom_substitutions(input_text, expected_output):
converter = ColloquialWordswapConverter(deterministic=True, custom_substitutions={})
async def test_colloquial_wordswap_path(input_text, wordswap_path, expected_output):
converter = ColloquialWordswapConverter(deterministic=True, wordswap_path=wordswap_path)
result = await converter.convert_async(prompt=input_text)
assert result.output_text == expected_output


# Test multiple word prompts for non-default wordswap paths
@pytest.mark.asyncio
@pytest.mark.parametrize(
    ("input_text", "wordswap_path", "expected_output"),
    [
        ("father and mother", "indian.yaml", "papa and mummy"),
        ("brother and sister", "southern_american.yaml", "bubba and sissy"),
        ("aunt and uncle", "multicultural_london.yaml", "aunty and unc"),
    ],
)
async def test_multiple_words_custom_colloquialisms(input_text, wordswap_path, expected_output):
    """Several swappable words in one prompt are each replaced from the chosen YAML file."""
    swapper = ColloquialWordswapConverter(wordswap_path=wordswap_path, deterministic=True)
    conversion = await swapper.convert_async(prompt=input_text)
    assert conversion.output_text == expected_output


# Test default behavior when no wordswap_path or custom_substitutions given
@pytest.mark.asyncio
async def test_colloquial_default_singaporean():
    """With no substitution source given, the deterministic output is "mama and papa"."""
    prompt = "mother and father"
    swapper = ColloquialWordswapConverter(deterministic=True)
    conversion = await swapper.convert_async(prompt=prompt)
    assert conversion.output_text == "mama and papa"


# Test that empty custom_substitutions falls through to defaults
@pytest.mark.asyncio
async def test_colloquial_empty_custom_substitutions():
    """An empty custom dict yields the same output as supplying no substitutions at all."""
    no_overrides: dict = {}
    swapper = ColloquialWordswapConverter(deterministic=True, custom_substitutions=no_overrides)
    conversion = await swapper.convert_async(prompt="mother and father")
    assert conversion.output_text == "mama and papa"


# Test multiple word prompts
@pytest.mark.asyncio
@pytest.mark.parametrize(
Expand Down Expand Up @@ -139,3 +176,52 @@ def test_colloquial_converter_input_supported() -> None:
converter = ColloquialWordswapConverter()
assert converter.input_supported("text") is True
assert converter.input_supported("image_path") is False


def test_init_conflict_custom_substitutions_and_path():
    """Supplying both a custom dict and a YAML path is rejected with ValueError."""
    conflicting_kwargs = {
        "custom_substitutions": {"foo": ["bar"]},
        "wordswap_path": "some_file.yaml",
    }
    with pytest.raises(ValueError, match="Provide either custom_substitutions or wordswap_path"):
        ColloquialWordswapConverter(**conflicting_kwargs)


def test_init_missing_yaml_file():
    """A wordswap filename that does not exist raises FileNotFoundError."""
    missing_name = "nonexistent.yaml"
    with pytest.raises(FileNotFoundError, match="Colloquial wordswap file not found"):
        ColloquialWordswapConverter(wordswap_path=missing_name)


def test_init_invalid_yaml_format(tmp_path):
    """A YAML document whose top level is a sequence, not a mapping, raises ValueError."""
    yaml_file = tmp_path / "bad.yaml"
    yaml_file.write_text("[not, a, dict]", encoding="utf-8")
    path_arg = str(yaml_file)
    with pytest.raises(ValueError, match="must be a dict"):
        ColloquialWordswapConverter(wordswap_path=path_arg)


def test_init_invalid_yaml_values(tmp_path):
    """A mapping value that is a plain string instead of a list raises ValueError."""
    yaml_file = tmp_path / "bad_values.yaml"
    yaml_file.write_text('father: "not a list"', encoding="utf-8")
    path_arg = str(yaml_file)
    with pytest.raises(ValueError, match="Invalid entry in wordswap YAML"):
        ColloquialWordswapConverter(wordswap_path=path_arg)


def test_init_invalid_yaml_empty_list(tmp_path):
    """A mapping value that is an empty list raises ValueError."""
    yaml_file = tmp_path / "empty_list.yaml"
    yaml_file.write_text("father: []", encoding="utf-8")
    path_arg = str(yaml_file)
    with pytest.raises(ValueError, match="Invalid entry in wordswap YAML"):
        ColloquialWordswapConverter(wordswap_path=path_arg)


def test_init_malformed_yaml(tmp_path):
    """File content that the YAML parser cannot load surfaces as ValueError."""
    yaml_file = tmp_path / "malformed.yaml"
    yaml_file.write_text("{{invalid yaml::", encoding="utf-8")
    path_arg = str(yaml_file)
    with pytest.raises(ValueError, match="Invalid YAML format"):
        ColloquialWordswapConverter(wordswap_path=path_arg)


def test_deterministic_positional_deprecation_warning():
    """Passing deterministic positionally warns, and the value is still applied."""
    with pytest.warns(FutureWarning, match="positional argument is deprecated"):
        converter = ColloquialWordswapConverter(True)
    # Warning alone is not enough: the deprecated positional value must still take effect,
    # otherwise existing callers would silently switch to random substitutions.
    assert converter._deterministic is True


def test_deterministic_keyword_no_deprecation_warning():
    """The keyword form of deterministic must not emit a FutureWarning."""
    with warnings.catch_warnings():
        # Escalate FutureWarning to an exception so any emission fails the test.
        warnings.filterwarnings("error", category=FutureWarning)
        ColloquialWordswapConverter(deterministic=True)