From 97d3fcd6513320edf4195ba62940af3eb79ffd8f Mon Sep 17 00:00:00 2001
From: Russell Bryant <rbryant@redhat.com>
Date: Wed, 3 Jul 2024 19:53:23 -0400
Subject: [PATCH 1/2] filterblock: Don't break when filtering on unexpected
 data

The previous code in this block did filtering assuming that all
samples had a value that was correct for the type. For example, when
filtering on an integer value, it assumed every row had a valid
integer, where it may instead have garbage.

This change introduces a new helper, _convert_dtype(), which properly
handles this condition. When the conversion fails on a `ValueError`
exception, it treats it as `None` instead of allowing the exception to
be raised up to the caller.

The fix was authored by Shiv in PR #72. I only pulled it out into a
standalone commit.

Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: shiv <shivchander.s30@gmail.com>
---
 src/instructlab/sdg/filterblock.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/instructlab/sdg/filterblock.py b/src/instructlab/sdg/filterblock.py
index e6d4eb24..794a4071 100644
--- a/src/instructlab/sdg/filterblock.py
+++ b/src/instructlab/sdg/filterblock.py
@@ -20,13 +20,20 @@ def __init__(
         self.convert_dtype = convert_dtype
         self.num_procs = batch_kwargs.get("num_procs", 1)
 
+    def _convert_dtype(self, sample):
+        try:
+            sample[self.column_name] = self.convert_dtype(sample[self.column_name])
+        except ValueError as e:
+            logger.error(
+                "Error converting dtype: %s, filling with None to be filtered later", e
+            )
+            sample[self.column_name] = None
+        return sample
+
     def generate(self, samples) -> Dataset:
         if self.convert_dtype:
             samples = samples.map(
-                lambda x: {
-                    **x,
-                    self.column_name: self.convert_dtype(x[self.column_name]),
-                },
+                self._convert_dtype,
                 num_proc=self.num_procs,
             )
 

From 0de3f62d13a373ca133f8605e285b0b92af0a7ef Mon Sep 17 00:00:00 2001
From: Russell Bryant <rbryant@redhat.com>
Date: Wed, 3 Jul 2024 19:56:45 -0400
Subject: [PATCH 2/2] Add unit tests for sdg.filterblock

This new file introduces some unit tests for the filterblock module.
A previous commit introduced a bug fix to ensure that unexpected data
does not cause this block to break. These tests exercise those fixes.
The tests introduced here will run automatically in CI and help ensure
that the problem does not reoccur.

Signed-off-by: Russell Bryant <rbryant@redhat.com>
---
 tests/test_filterblock.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 tests/test_filterblock.py

diff --git a/tests/test_filterblock.py b/tests/test_filterblock.py
new file mode 100644
index 00000000..9ee47c65
--- /dev/null
+++ b/tests/test_filterblock.py
@@ -0,0 +1,31 @@
+# Standard
+from unittest.mock import patch
+import operator
+import unittest
+
+# Third Party
+from datasets import Dataset, Features, Value
+
+# First Party
+from instructlab.sdg.filterblock import FilterByValueBlock
+
+
+class TestFilterByValueBlock(unittest.TestCase):
+    def setUp(self):
+        self.block = FilterByValueBlock(
+            filter_column="age",
+            filter_value=30,
+            operation=operator.eq,
+            convert_dtype=int,
+        )
+        self.dataset = Dataset.from_dict(
+            {"age": ["25", "30", "35", "forty", "45"]},
+            features=Features({"age": Value("string")}),
+        )
+
+    @patch("instructlab.sdg.filterblock.logger")
+    def test_generate_mixed_types(self, mock_logger):
+        filtered_dataset = self.block.generate(self.dataset)
+        self.assertEqual(len(filtered_dataset), 1)
+        self.assertEqual(filtered_dataset["age"], [30])
+        mock_logger.error.assert_called()