From 97d3fcd6513320edf4195ba62940af3eb79ffd8f Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 3 Jul 2024 19:53:23 -0400 Subject: [PATCH 1/2] filterblock: Don't break when filtering on unexpected data The previous code in this block did filtering assuming that all samples had a value that was correct for the type. For example, when filtering on an integer value, it assumed every row had a valid integer, where it may instead have garbage. This change introduces a new helper, _convert_dtype(), which properly handles this condition. When the conversion fails on a `ValueError` exception, it treats it as `None` instead of allowing the exception to be raised up to the caller. The fix was authored by Shiv in PR #72. I only pulled it out into a standalone commit. Signed-off-by: Russell Bryant Co-authored-by: shiv --- src/instructlab/sdg/filterblock.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py index e6d4eb24..794a4071 100644 --- a/src/instructlab/sdg/filterblock.py +++ b/src/instructlab/sdg/filterblock.py @@ -20,13 +20,20 @@ def __init__( self.convert_dtype = convert_dtype self.num_procs = batch_kwargs.get("num_procs", 1) + def _convert_dtype(self, sample): + try: + sample[self.column_name] = self.convert_dtype(sample[self.column_name]) + except ValueError as e: + logger.error( + "Error converting dtype: %s, filling with None to be filtered later", e + ) + sample[self.column_name] = None + return sample + def generate(self, samples) -> Dataset: if self.convert_dtype: samples = samples.map( - lambda x: { - **x, - self.column_name: self.convert_dtype(x[self.column_name]), - }, + self._convert_dtype, num_proc=self.num_procs, ) From 0de3f62d13a373ca133f8605e285b0b92af0a7ef Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 3 Jul 2024 19:56:45 -0400 Subject: [PATCH 2/2] Add unit tests for sdg.filterblock This new file introduces some unit tests for the filterblock module. A previous commit introduced a bug fix to ensure that unexpected data does not cause this block to break. These tests exercise those fixes. The tests introduced here will run automatically in CI and help ensure that the problem does not reoccur. Signed-off-by: Russell Bryant --- tests/test_filterblock.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 tests/test_filterblock.py diff --git a/tests/test_filterblock.py b/tests/test_filterblock.py new file mode 100644 index 00000000..9ee47c65 --- /dev/null +++ b/tests/test_filterblock.py @@ -0,0 +1,31 @@ +# Standard +from unittest.mock import patch +import operator +import unittest + +# Third Party +from datasets import Dataset, Features, Value + +# First Party +from instructlab.sdg.filterblock import FilterByValueBlock + + +class TestFilterByValueBlock(unittest.TestCase): + def setUp(self): + self.block = FilterByValueBlock( + filter_column="age", + filter_value=30, + operation=operator.eq, + convert_dtype=int, + ) + self.dataset = Dataset.from_dict( + {"age": ["25", "30", "35", "forty", "45"]}, + features=Features({"age": Value("string")}), + ) + + @patch("instructlab.sdg.filterblock.logger") + def test_generate_mixed_types(self, mock_logger): + filtered_dataset = self.block.generate(self.dataset) + self.assertEqual(len(filtered_dataset), 1) + self.assertEqual(filtered_dataset["age"], [30]) + mock_logger.error.assert_called()