Skip to content

Commit 2657098

Browse files
committed
gh-144001: Add ignorechars parameter to Base64 decoder
1 parent 7ca9e7a commit 2657098

File tree

6 files changed

+170
-18
lines changed

6 files changed

+170
-18
lines changed

Lib/base64.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ def b64encode(s, altchars=None, *, wrapcol=0):
6262
return encoded
6363

6464

65-
def b64decode(s, altchars=None, validate=False):
65+
def b64decode(s, altchars=None, validate=False, *, ignorechars=None):
6666
"""Decode the Base64 encoded bytes-like object or ASCII string s.
6767
6868
Optional altchars must be a bytes-like object or ASCII string of length 2
@@ -79,13 +79,18 @@ def b64decode(s, altchars=None, validate=False):
7979
For more information about the strict base64 check, see:
8080
8181
https://docs.python.org/3.11/library/binascii.html#binascii.a2b_base64
82+
83+
Optional ignorechars must be a bytes-like object specifying characters to
84+
ignore during decoding. When provided, only characters in this set will be
85+
silently ignored; other non-base64 characters will cause a binascii.Error.
86+
When None (the default), the behavior is controlled by the validate parameter.
8287
"""
8388
s = _bytes_from_decode_data(s)
8489
if altchars is not None:
8590
altchars = _bytes_from_decode_data(altchars)
8691
assert len(altchars) == 2, repr(altchars)
8792
s = s.translate(bytes.maketrans(altchars, b'+/'))
88-
return binascii.a2b_base64(s, strict_mode=validate)
93+
return binascii.a2b_base64(s, strict_mode=validate, ignorechars=ignorechars)
8994

9095

9196
def standard_b64encode(s):

Lib/test/test_base64.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,47 @@ def test_b64decode_invalid_chars(self):
331331
self.assertEqual(base64.urlsafe_b64decode(b'++//'), res)
332332
self.assertEqual(base64.urlsafe_b64decode(b'--__'), res)
333333

334+
def test_b64decode_ignorechars(self):
335+
# gh-144001: Test ignorechars parameter
336+
eq = self.assertEqual
337+
338+
# Basic functionality: ignore whitespace characters
339+
eq(base64.b64decode(b'YWJj\n', ignorechars=b'\n'), b'abc')
340+
eq(base64.b64decode(b'YWJj\r\n', ignorechars=b'\r\n'), b'abc')
341+
eq(base64.b64decode(b'YWJj \t\n', ignorechars=b' \t\n'), b'abc')
342+
343+
# Multiple whitespace characters in data
344+
eq(base64.b64decode(b'YW Jj\nYW I=', ignorechars=b' \n'), b'abcab')
345+
346+
# ignorechars=b'' should reject all non-base64 characters
347+
with self.assertRaises(binascii.Error):
348+
base64.b64decode(b'YWJj\n', ignorechars=b'')
349+
with self.assertRaises(binascii.Error):
350+
base64.b64decode(b'YWJj ', ignorechars=b'')
351+
352+
# Characters not in ignorechars should raise error
353+
with self.assertRaises(binascii.Error):
354+
base64.b64decode(b'YWJj!', ignorechars=b'\n')
355+
with self.assertRaises(binascii.Error):
356+
base64.b64decode(b'YWJj@', ignorechars=b' \t\n')
357+
358+
# ignorechars with custom characters
359+
eq(base64.b64decode(b'YW|Jj', ignorechars=b'|'), b'abc')
360+
eq(base64.b64decode(b'YW#Jj', ignorechars=b'#'), b'abc')
361+
362+
# ignorechars=None (default) keeps the legacy behavior: non-base64 characters are discarded
363+
eq(base64.b64decode(b'YWJj\n', ignorechars=None), b'abc')
364+
eq(base64.b64decode(b'YWJj!', ignorechars=None), b'abc')
365+
366+
# Test with altchars and ignorechars together
367+
eq(base64.b64decode(b'YW-j\n', altchars=b'-_', ignorechars=b'\n'), b'ao\xa3')
368+
369+
# Test string input
370+
eq(base64.b64decode('YWJj\n', ignorechars=b'\n'), b'abc')
371+
372+
# Test that ignorechars accepts various bytes-like objects
373+
eq(base64.b64decode(b'YWJj\n', ignorechars=bytearray(b'\n')), b'abc')
374+
334375
def _altchars_strategy():
335376
"""Generate 'altchars' for base64 encoding."""
336377
reserved_chars = (string.digits + string.ascii_letters + "=").encode()

Lib/test/test_binascii.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,45 @@ def assertExcessPadding(data, non_strict_mode_expected_result: bytes):
176176
assertExcessPadding(b'abcd====', b'i\xb7\x1d')
177177
assertExcessPadding(b'abcd=====', b'i\xb7\x1d')
178178

179+
def test_base64_ignorechars(self):
180+
# gh-144001: Test ignorechars parameter for a2b_base64
181+
a2b = binascii.a2b_base64
182+
type2test = self.type2test
183+
184+
# Basic functionality: ignore specified characters
185+
self.assertEqual(a2b(type2test(b'YWJj\n'), ignorechars=b'\n'), b'abc')
186+
self.assertEqual(a2b(type2test(b'YWJj\r\n'), ignorechars=b'\r\n'), b'abc')
187+
self.assertEqual(a2b(type2test(b'YWJj \t\n'), ignorechars=b' \t\n'), b'abc')
188+
189+
# Multiple ignored characters in data
190+
self.assertEqual(a2b(type2test(b'YW Jj\nYW I='), ignorechars=b' \n'), b'abcab')
191+
192+
# ignorechars=b'' should reject all non-base64 characters
193+
with self.assertRaisesRegex(binascii.Error, r'(?i)Only base64 data'):
194+
a2b(type2test(b'YWJj\n'), ignorechars=b'')
195+
with self.assertRaisesRegex(binascii.Error, r'(?i)Only base64 data'):
196+
a2b(type2test(b'YWJj '), ignorechars=b'')
197+
198+
# Characters not in ignorechars should raise error
199+
with self.assertRaisesRegex(binascii.Error, r'(?i)Only base64 data'):
200+
a2b(type2test(b'YWJj!'), ignorechars=b'\n')
201+
with self.assertRaisesRegex(binascii.Error, r'(?i)Only base64 data'):
202+
a2b(type2test(b'YWJj@'), ignorechars=b' \t\n')
203+
204+
# ignorechars with custom characters
205+
self.assertEqual(a2b(type2test(b'YW|Jj'), ignorechars=b'|'), b'abc')
206+
self.assertEqual(a2b(type2test(b'YW#Jj'), ignorechars=b'#'), b'abc')
207+
208+
# ignorechars=None should use default behavior (ignore all non-base64)
209+
self.assertEqual(a2b(type2test(b'YWJj\n'), ignorechars=None), b'abc')
210+
self.assertEqual(a2b(type2test(b'YWJj!'), ignorechars=None), b'abc')
211+
212+
# Test interaction with strict_mode
213+
# With strict_mode=False passed explicitly, ignorechars still decides which characters are skipped
214+
self.assertEqual(a2b(type2test(b'YWJj\n'), ignorechars=b'\n', strict_mode=False), b'abc')
215+
216+
# Test that ignorechars accepts various bytes-like objects
217+
self.assertEqual(a2b(type2test(b'YWJj\n'), ignorechars=bytearray(b'\n')), b'abc')
179218

180219
def test_base64errors(self):
181220
# Test base64 with invalid padding
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
Add the ``ignorechars`` parameter to :func:`binascii.a2b_base64` and
2+
:func:`base64.b64decode`. When provided, only characters in this set will be
3+
silently ignored during decoding; other non-base64 characters will cause an
4+
error. This allows selective filtering of characters (e.g., ignoring
5+
whitespace while rejecting other invalid characters), similar to the existing
6+
``ignorechars`` parameter in :func:`base64.a85decode`.

Modules/binascii.c

Lines changed: 52 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -477,17 +477,24 @@ binascii.a2b_base64
477477
/
478478
*
479479
strict_mode: bool = False
480+
ignorechars: object = None
480481
481482
Decode a line of base64 data.
482483
483484
strict_mode
484485
When set to True, bytes that are not part of the base64 standard are not allowed.
485486
The same applies to excess data after padding (= / ==).
487+
ignorechars
488+
A bytes-like object specifying characters to ignore during decoding.
489+
When provided, only characters in this set will be silently ignored;
490+
other non-base64 characters will cause an error. When None (the default),
491+
all non-base64 characters are silently ignored (unless strict_mode is True).
486492
[clinic start generated code]*/
487493

488494
static PyObject *
489-
binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode)
490-
/*[clinic end generated code: output=5409557788d4f975 input=13c797187acc9c40]*/
495+
binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode,
496+
PyObject *ignorechars)
497+
/*[clinic end generated code: output=7d2b92b6f1de3ccc input=485946ff2e8960c6]*/
491498
{
492499
assert(data->len >= 0);
493500

@@ -496,10 +503,30 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode)
496503
binascii_state *state = NULL;
497504
char padding_started = 0;
498505

506+
/* Handle ignorechars parameter */
507+
Py_buffer ignorechars_buf = {0};
508+
int has_ignorechars = 0;
509+
unsigned char ignorechars_table[256] = {0}; /* Lookup table for ignored chars */
510+
511+
if (ignorechars != Py_None) {
512+
if (PyObject_GetBuffer(ignorechars, &ignorechars_buf, PyBUF_SIMPLE) < 0) {
513+
return NULL;
514+
}
515+
has_ignorechars = 1;
516+
/* Build lookup table for O(1) character checking */
517+
const unsigned char *ic = (const unsigned char *)ignorechars_buf.buf;
518+
for (Py_ssize_t j = 0; j < ignorechars_buf.len; j++) {
519+
ignorechars_table[ic[j]] = 1;
520+
}
521+
}
522+
499523
/* Allocate the buffer */
500524
Py_ssize_t bin_len = ((ascii_len+3)/4)*3; /* Upper bound, corrected later */
501525
PyBytesWriter *writer = PyBytesWriter_Create(bin_len);
502526
if (writer == NULL) {
527+
if (has_ignorechars) {
528+
PyBuffer_Release(&ignorechars_buf);
529+
}
503530
return NULL;
504531
}
505532
unsigned char *bin_data = PyBytesWriter_GetData(writer);
@@ -517,8 +544,9 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode)
517544
/* Fast path: use optimized decoder for complete quads.
518545
* This works for both strict and non-strict mode for valid input.
519546
* The fast path stops at padding, invalid chars, or incomplete groups.
547+
* Skip fast path when ignorechars is provided, as we need to check each char.
520548
*/
521-
if (ascii_len >= 4) {
549+
if (ascii_len >= 4 && !has_ignorechars) {
522550
Py_ssize_t fast_chars = base64_decode_fast(ascii_data, (Py_ssize_t)ascii_len,
523551
bin_data, table_a2b_base64);
524552
if (fast_chars > 0) {
@@ -533,6 +561,7 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode)
533561
int pads = 0;
534562
for (; i < ascii_len; i++) {
535563
unsigned char this_ch = ascii_data[i];
564+
unsigned char orig_ch = this_ch; /* Save original for ignorechars check */
536565

537566
/* Check for pad sequences and ignore
538567
** the invalid ones.
@@ -567,7 +596,20 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode)
567596

568597
this_ch = table_a2b_base64[this_ch];
569598
if (this_ch >= 64) {
570-
if (strict_mode) {
599+
/* Non-base64 character found */
600+
if (has_ignorechars) {
601+
/* When ignorechars is provided, only skip if char is in the set */
602+
if (ignorechars_table[orig_ch]) {
603+
continue; /* Character is in ignorechars, skip it */
604+
}
605+
/* Character not in ignorechars, raise error */
606+
state = get_binascii_state(module);
607+
if (state) {
608+
PyErr_SetString(state->Error, "Only base64 data is allowed");
609+
}
610+
goto error_end;
611+
}
612+
else if (strict_mode) {
571613
state = get_binascii_state(module);
572614
if (state) {
573615
PyErr_SetString(state->Error, "Only base64 data is allowed");
@@ -634,9 +676,15 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode)
634676
}
635677

636678
done:
679+
if (has_ignorechars) {
680+
PyBuffer_Release(&ignorechars_buf);
681+
}
637682
return PyBytesWriter_FinishWithPointer(writer, bin_data);
638683

639684
error_end:
685+
if (has_ignorechars) {
686+
PyBuffer_Release(&ignorechars_buf);
687+
}
640688
PyBytesWriter_Discard(writer);
641689
return NULL;
642690
}

Modules/clinic/binascii.c.h

Lines changed: 25 additions & 12 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)