diff --git a/pcg_basic.c b/pcg_basic.c
index 8c2fd0d..9181c54 100644
--- a/pcg_basic.c
+++ b/pcg_basic.c
@@ -78,22 +78,6 @@ uint32_t pcg32_random()
 
 uint32_t pcg32_boundedrand_r(pcg32_random_t* rng, uint32_t bound)
 {
-    // To avoid bias, we need to make the range of the RNG a multiple of
-    // bound, which we do by dropping output less than a threshold.
-    // A naive scheme to calculate the threshold would be to do
-    //
-    //     uint32_t threshold = 0x100000000ull % bound;
-    //
-    // but 64-bit div/mod is slower than 32-bit div/mod (especially on
-    // 32-bit platforms).  In essence, we do
-    //
-    //     uint32_t threshold = (0x100000000ull-bound) % bound;
-    //
-    // because this version will calculate the same modulus, but the LHS
-    // value is less than 2^32.
-
-    uint32_t threshold = -bound % bound;
-
     // Uniformity guarantees that this loop will terminate.  In practice, it
     // should usually terminate quickly; on average (assuming all bounds are
     // equally likely), 82.25% of the time, we can expect it to require just
@@ -103,8 +87,15 @@ uint32_t pcg32_boundedrand_r(pcg32_random_t* rng, uint32_t bound)
     // is eliminated.
     for (;;) {
         uint32_t r = pcg32_random_r(rng);
-        if (r >= threshold)
-            return r % bound;
+
+        // Start of bound-multiple segment which holds r
+        uint32_t rint = r / bound * bound;
+
+        // Check if the end of that segment do not overflow,
+        // i.e. rint + bound <= 2^32 or
+        // rint <= 2^32 - bound = (uint32_t)(-bound)
+        if (rint <= (uint32_t)(-bound))
+            return r - rint;
     }
 }