Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions Doc/whatsnew/3.15.rst
Original file line number Diff line number Diff line change
Expand Up @@ -428,6 +428,12 @@ argparse
inline code when color output is enabled.
(Contributed by Savannah Ostrowski in :gh:`142390`.)

base64 & binascii
-----------------

* CPython's underlying base64 implementation now encodes 2x faster and decodes 3x
faster thanks to simple CPU pipelining optimizations.

calendar
--------

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
The base64 implementation behind the :mod:`binascii`, :mod:`base64`, and
related codecs has been optimized for modern pipelined CPU architectures and
now performs 2-3x faster across all platforms.
155 changes: 133 additions & 22 deletions Modules/binascii.c
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,8 @@ get_binascii_state(PyObject *module)
}


static const unsigned char table_a2b_base64[] = {
/* Align to a 64-byte boundary so the 256-byte table starts on an L1
   cache-line boundary and occupies exactly four cache lines. */
static const unsigned char table_a2b_base64[] Py_ALIGNED(64) = {
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
-1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63,
Expand All @@ -101,9 +102,101 @@ static const unsigned char table_a2b_base64[] = {
/* Max binary chunk size; limited only by available memory */
#define BASE64_MAXBIN ((PY_SSIZE_T_MAX - 3) / 2)

static const unsigned char table_b2a_base64[] =
/*
* Fast base64 encoding/decoding helpers.
*
* Process complete groups without loop-carried dependencies.
*/

/* RFC 4648 base64 alphabet. The 64 indexable bytes are aligned to a
 * 64-byte boundary so the whole table shares a single L1 cache line
 * (the trailing NUL added by the string literal is never indexed). */
static const unsigned char table_b2a_base64[] Py_ALIGNED(64) =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/* Encode 3 bytes into 4 base64 characters. */
/* Encode one 3-byte group into 4 base64 characters.
 *
 * in    - exactly 3 input bytes
 * out   - receives exactly 4 alphabet characters
 * table - 64-entry encoding alphabet
 */
static inline void
base64_encode_trio(const unsigned char *in, unsigned char *out,
                   const unsigned char *table)
{
    /* Split 24 input bits into four 6-bit indices without first packing
       them into a single word; the bit layout is identical. */
    unsigned int b0 = in[0], b1 = in[1], b2 = in[2];

    out[0] = table[b0 >> 2];
    out[1] = table[((b0 & 0x03) << 4) | (b1 >> 4)];
    out[2] = table[((b1 & 0x0f) << 2) | (b2 >> 6)];
    out[3] = table[b2 & 0x3f];
}

/* Encode multiple complete 3-byte groups.
* Returns the number of input bytes processed (always a multiple of 3).
*/
/* Encode every complete 3-byte group in the input.
 *
 * Returns the number of input bytes consumed (always a multiple of 3);
 * any 1-2 byte tail is left for the caller's padding logic.
 */
static inline Py_ssize_t
base64_encode_fast(const unsigned char *in, Py_ssize_t in_len,
                   unsigned char *out, const unsigned char *table)
{
    const unsigned char *src = in;
    const unsigned char *end = in + (in_len - in_len % 3);
    unsigned char *dst = out;

    while (src < end) {
        base64_encode_trio(src, dst, table);
        src += 3;
        dst += 4;
    }

    return (Py_ssize_t)(src - in);
}

/* Decode 4 base64 characters into 3 bytes.
* Returns 1 on success, 0 if any character is invalid.
*/
/* Decode one 4-character base64 group into 3 bytes.
 *
 * Returns 1 on success; returns 0 (writing nothing) if any character is
 * outside the alphabet.
 */
static inline int
base64_decode_quad(const unsigned char *in, unsigned char *out,
                   const unsigned char *table)
{
    /* Four independent table loads; invalid characters (and '=') map to
       entries with a high bit set, which the combined mask test catches. */
    const unsigned char a = table[in[0]];
    const unsigned char b = table[in[1]];
    const unsigned char c = table[in[2]];
    const unsigned char d = table[in[3]];

    if ((a | b | c | d) & 0xc0) {
        return 0;
    }

    /* Reassemble the four 6-bit values into 24 bits, then split by byte. */
    unsigned int acc = ((unsigned int)a << 18) | ((unsigned int)b << 12) |
                       ((unsigned int)c << 6) | (unsigned int)d;
    out[0] = (unsigned char)(acc >> 16);
    out[1] = (unsigned char)(acc >> 8);
    out[2] = (unsigned char)acc;
    return 1;
}

/* Decode multiple complete 4-character groups (no padding allowed).
* Returns the number of input characters processed.
* Stops at the first invalid character, padding, or incomplete group.
*/
/* Decode consecutive complete 4-character groups (no padding allowed).
 *
 * Returns the number of input characters consumed (a multiple of 4).
 * Stops — leaving the rest for the caller's general-purpose loop — at the
 * first padding character, the first invalid character, or an incomplete
 * trailing group.
 */
static inline Py_ssize_t
base64_decode_fast(const unsigned char *in, Py_ssize_t in_len,
                   unsigned char *out, const unsigned char *table)
{
    const unsigned char *src = in;
    const unsigned char *limit = in + (in_len - in_len % 4);
    unsigned char *dst = out;

    while (src < limit) {
        /* Padding requires the slow path's position-sensitive checks.
         * Four independent comparisons let the compiler pick the best
         * encoding; on pipelined CPUs this beats dependency-heavy
         * bitmask zero-detection tricks. */
        if (src[0] == BASE64_PAD || src[1] == BASE64_PAD ||
            src[2] == BASE64_PAD || src[3] == BASE64_PAD) {
            break;
        }

        if (!base64_decode_quad(src, dst, table)) {
            break;
        }

        src += 4;
        dst += 3;
    }

    return (Py_ssize_t)(src - in);
}


static const unsigned short crctab_hqx[256] = {
0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7,
Expand Down Expand Up @@ -403,10 +496,26 @@ binascii_a2b_base64_impl(PyObject *module, Py_buffer *data, int strict_mode)
goto error_end;
}

size_t i = 0; /* Current position in input */

/* Fast path: use optimized decoder for complete quads.
* This works for both strict and non-strict mode for valid input.
* The fast path stops at padding, invalid chars, or incomplete groups.
*/
if (ascii_len >= 4) {
Py_ssize_t fast_chars = base64_decode_fast(ascii_data, (Py_ssize_t)ascii_len,
bin_data, table_a2b_base64);
if (fast_chars > 0) {
i = (size_t)fast_chars;
bin_data += (fast_chars / 4) * 3;
}
}

/* Slow path: handle remaining input (padding, invalid chars, partial groups) */
int quad_pos = 0;
unsigned char leftchar = 0;
int pads = 0;
for (size_t i = 0; i < ascii_len; i++) {
for (; i < ascii_len; i++) {
unsigned char this_ch = ascii_data[i];

/* Check for pad sequences and ignore
Expand Down Expand Up @@ -533,9 +642,6 @@ binascii_b2a_base64_impl(PyObject *module, Py_buffer *data, int newline)
/*[clinic end generated code: output=4ad62c8e8485d3b3 input=0e20ff59c5f2e3e1]*/
{
const unsigned char *bin_data;
int leftbits = 0;
unsigned char this_ch;
unsigned int leftchar = 0;
Py_ssize_t bin_len;
binascii_state *state;

Expand Down Expand Up @@ -566,26 +672,31 @@ binascii_b2a_base64_impl(PyObject *module, Py_buffer *data, int newline)
}
unsigned char *ascii_data = PyBytesWriter_GetData(writer);

for( ; bin_len > 0 ; bin_len--, bin_data++ ) {
/* Shift the data into our buffer */
leftchar = (leftchar << 8) | *bin_data;
leftbits += 8;

/* See if there are 6-bit groups ready */
while ( leftbits >= 6 ) {
this_ch = (leftchar >> (leftbits-6)) & 0x3f;
leftbits -= 6;
*ascii_data++ = table_b2a_base64[this_ch];
}
}
if ( leftbits == 2 ) {
*ascii_data++ = table_b2a_base64[(leftchar&3) << 4];
/* Use the optimized fast path for complete 3-byte groups */
Py_ssize_t fast_bytes = base64_encode_fast(bin_data, bin_len, ascii_data,
table_b2a_base64);
bin_data += fast_bytes;
ascii_data += (fast_bytes / 3) * 4;
bin_len -= fast_bytes;

/* Handle remaining 0-2 bytes */
if (bin_len == 1) {
/* 1 byte remaining: produces 2 base64 chars + 2 padding */
unsigned int val = bin_data[0];
*ascii_data++ = table_b2a_base64[(val >> 2) & 0x3f];
*ascii_data++ = table_b2a_base64[(val << 4) & 0x3f];
*ascii_data++ = BASE64_PAD;
*ascii_data++ = BASE64_PAD;
} else if ( leftbits == 4 ) {
*ascii_data++ = table_b2a_base64[(leftchar&0xf) << 2];
}
else if (bin_len == 2) {
/* 2 bytes remaining: produces 3 base64 chars + 1 padding */
unsigned int val = ((unsigned int)bin_data[0] << 8) | bin_data[1];
*ascii_data++ = table_b2a_base64[(val >> 10) & 0x3f];
*ascii_data++ = table_b2a_base64[(val >> 4) & 0x3f];
*ascii_data++ = table_b2a_base64[(val << 2) & 0x3f];
*ascii_data++ = BASE64_PAD;
}

if (newline)
*ascii_data++ = '\n'; /* Append a courtesy newline */

Expand Down
Loading
Loading