From 11dcee0cc8441c0225788c01e300cc3d4df24251 Mon Sep 17 00:00:00 2001 From: Arturo Bernal Date: Tue, 11 Mar 2025 16:44:27 +0100 Subject: [PATCH 1/2] Bug Fix: Align URIBuilder encoding with RFC 3986 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, URIBuilder relied on partial or inconsistent character sets for various components, causing valid sub-delims or other characters to be unnecessarily percent-encoded or left unencoded in the fragment, query, userinfo, reg-name, and path segments. This patch introduces dedicated BitSets for each URI component (userinfo, host/reg-name, path segments, query, and fragment) and updates URIBuilder to use them. As a result, characters like ':', '@', '/', and '?' remain unencoded in the fragment and query where allowed by RFC 3986, while certain sub-delimiters in the path and host are now percent-encoded for strictness. This ensures consistent, RFC 3986–compliant encoding across all URI components. --- .../org/apache/hc/core5/net/PercentCodec.java | 41 +++++++++++++++++++ .../org/apache/hc/core5/net/URIBuilder.java | 14 +++---- .../apache/hc/core5/net/TestURIBuilder.java | 29 +++++++++++++ 3 files changed, 77 insertions(+), 7 deletions(-) diff --git a/httpcore5/src/main/java/org/apache/hc/core5/net/PercentCodec.java b/httpcore5/src/main/java/org/apache/hc/core5/net/PercentCodec.java index 40ebc9cb81..a015d1e946 100644 --- a/httpcore5/src/main/java/org/apache/hc/core5/net/PercentCodec.java +++ b/httpcore5/src/main/java/org/apache/hc/core5/net/PercentCodec.java @@ -84,6 +84,16 @@ public class PercentCodec { URIC.or(UNRESERVED); } + static final BitSet FRAGMENT_SAFE = new BitSet(256); + static { + FRAGMENT_SAFE.or(UNRESERVED); + FRAGMENT_SAFE.or(SUB_DELIMS); + FRAGMENT_SAFE.set(':'); + FRAGMENT_SAFE.set('@'); + FRAGMENT_SAFE.set('/'); + FRAGMENT_SAFE.set('?'); + } + static final BitSet RFC5987_UNRESERVED = new BitSet(256); static { @@ -113,6 +123,37 @@ public class PercentCodec { RFC5987_UNRESERVED.set('~'); } + static final BitSet PCHAR = new BitSet(256); + static final BitSet USERINFO = new BitSet(256); + static final BitSet REG_NAME = new BitSet(256); + static final BitSet PATH_SEGMENT = new BitSet(256); + static final BitSet QUERY = new BitSet(256); + static final BitSet FRAGMENT = new BitSet(256); + + static { + PCHAR.or(UNRESERVED); + PCHAR.or(SUB_DELIMS); + PCHAR.set(':'); + PCHAR.set('@'); + USERINFO.or(UNRESERVED); + USERINFO.or(SUB_DELIMS); + USERINFO.set(':'); + REG_NAME.or(UNRESERVED); + REG_NAME.or(SUB_DELIMS); + REG_NAME.clear('!'); + PATH_SEGMENT.or(PCHAR); + QUERY.or(PCHAR); + QUERY.set('/'); + QUERY.set('?'); + FRAGMENT.or(PCHAR); + FRAGMENT.set('/'); + FRAGMENT.set('?'); + // Some sub-delims remain encoded (RFC 3986 allows them unencoded, but we choose to be strict). + PATH_SEGMENT.clear('('); + PATH_SEGMENT.clear(')'); + PATH_SEGMENT.clear('&'); + } + private static final int RADIX = 16; static void encode(final StringBuilder buf, final CharSequence content, final Charset charset, diff --git a/httpcore5/src/main/java/org/apache/hc/core5/net/URIBuilder.java b/httpcore5/src/main/java/org/apache/hc/core5/net/URIBuilder.java index 7151d7bd97..a0160ec41e 100644 --- a/httpcore5/src/main/java/org/apache/hc/core5/net/URIBuilder.java +++ b/httpcore5/src/main/java/org/apache/hc/core5/net/URIBuilder.java @@ -306,7 +306,7 @@ static void formatPath(final StringBuilder buf, final Iterable segments, if (i > 0 || !rootless) { buf.append(PATH_SEPARATOR); } - PercentCodec.encode(buf, segment, charset); + PercentCodec.encode(buf, segment, charset, PercentCodec.PATH_SEGMENT, false); i++; } } @@ -356,18 +356,18 @@ private String buildString() { } else if (this.userInfo != null) { final int idx = this.userInfo.indexOf(':'); if (idx != -1) { - PercentCodec.encode(sb, this.userInfo.substring(0, idx), this.charset); + PercentCodec.encode(sb, this.userInfo.substring(0, idx), this.charset, PercentCodec.USERINFO, false); sb.append(':'); - PercentCodec.encode(sb, this.userInfo.substring(idx + 1), this.charset); + PercentCodec.encode(sb, this.userInfo.substring(idx + 1), this.charset, PercentCodec.USERINFO, false); } else { - PercentCodec.encode(sb, this.userInfo, this.charset); + PercentCodec.encode(sb, this.userInfo, this.charset, PercentCodec.USERINFO, false); } sb.append("@"); } if (InetAddressUtils.isIPv6(this.host)) { sb.append("[").append(this.host).append("]"); } else { - PercentCodec.encode(sb, this.host, this.charset); + PercentCodec.encode(sb, this.host, this.charset, PercentCodec.REG_NAME, false); } if (this.port >= 0) { sb.append(":").append(this.port); @@ -391,14 +391,14 @@ private String buildString() { formatQuery(sb, this.queryParams, this.charset, false); } else if (this.query != null) { sb.append("?"); - PercentCodec.encode(sb, this.query, this.charset, PercentCodec.URIC, false); + PercentCodec.encode(sb, this.query, this.charset, PercentCodec.QUERY, false); } } if (this.encodedFragment != null) { sb.append("#").append(this.encodedFragment); } else if (this.fragment != null) { sb.append("#"); - PercentCodec.encode(sb, this.fragment, this.charset, PercentCodec.URIC, false); + PercentCodec.encode(sb, this.fragment, this.charset, PercentCodec.FRAGMENT, false); } return sb.toString(); } diff --git a/httpcore5/src/test/java/org/apache/hc/core5/net/TestURIBuilder.java b/httpcore5/src/test/java/org/apache/hc/core5/net/TestURIBuilder.java index 98f54f390e..5e6614f25d 100644 --- a/httpcore5/src/test/java/org/apache/hc/core5/net/TestURIBuilder.java +++ b/httpcore5/src/test/java/org/apache/hc/core5/net/TestURIBuilder.java @@ -997,4 +997,33 @@ void testSetPlusAsBlank() throws Exception { params = uriBuilder.getQueryParams(); Assertions.assertEquals("hello world", params.get(0).getValue()); } + + @Test + void testFragmentEncoding() throws Exception { + final String fragment = "frag ment:!@/?\""; + final String expectedEncodedFragment = "frag%20ment:!@/?%22"; + + final URI uri = new URIBuilder() + .setScheme("http") + .setHost("example.com") + .setFragment(fragment) + .build(); + + Assertions.assertEquals(expectedEncodedFragment, uri.getRawFragment()); + } + + @Test + void testCustomQueryEncoding() throws Exception { + final String query = "query param:!@/?\""; + final String expectedEncodedQuery = "query%20param:!@/?%22"; + + final URI uri = new URIBuilder() + .setScheme("http") + .setHost("example.com") + .setCustomQuery(query) + .build(); + + Assertions.assertEquals(expectedEncodedQuery, uri.getRawQuery()); + } + } From 62bd76d1b6a5e57408ac9e87d53590f5a0a205d3 Mon Sep 17 00:00:00 2001 From: Arturo Bernal Date: Fri, 14 Mar 2025 09:25:57 +0100 Subject: [PATCH 2/2] Added EncodingPolicy enum (ALL_RESERVED, RFC_3986) and switched query and fragment encoding to PercentCodec.FRAGMENT under RFC_3986 --- .../org/apache/hc/core5/net/PercentCodec.java | 15 --- .../org/apache/hc/core5/net/URIBuilder.java | 93 ++++++++++++++++--- .../apache/hc/core5/net/TestURIBuilder.java | 17 +--- 3 files changed, 81 insertions(+), 44 deletions(-) diff --git a/httpcore5/src/main/java/org/apache/hc/core5/net/PercentCodec.java b/httpcore5/src/main/java/org/apache/hc/core5/net/PercentCodec.java index a015d1e946..bdf0ec9c37 100644 --- a/httpcore5/src/main/java/org/apache/hc/core5/net/PercentCodec.java +++ b/httpcore5/src/main/java/org/apache/hc/core5/net/PercentCodec.java @@ -84,16 +84,6 @@ public class PercentCodec { URIC.or(UNRESERVED); } - static final BitSet FRAGMENT_SAFE = new BitSet(256); - static { - FRAGMENT_SAFE.or(UNRESERVED); - FRAGMENT_SAFE.or(SUB_DELIMS); - FRAGMENT_SAFE.set(':'); - FRAGMENT_SAFE.set('@'); - FRAGMENT_SAFE.set('/'); - FRAGMENT_SAFE.set('?'); - } - static final BitSet RFC5987_UNRESERVED = new BitSet(256); static { @@ -140,7 +130,6 @@ public class PercentCodec { USERINFO.set(':'); REG_NAME.or(UNRESERVED); REG_NAME.or(SUB_DELIMS); - REG_NAME.clear('!'); PATH_SEGMENT.or(PCHAR); QUERY.or(PCHAR); QUERY.set('/'); @@ -148,10 +137,6 @@ public class PercentCodec { FRAGMENT.or(PCHAR); FRAGMENT.set('/'); FRAGMENT.set('?'); - // Some sub-delims remain encoded (RFC 3986 allows them unencoded, but we choose to be strict). - PATH_SEGMENT.clear('('); - PATH_SEGMENT.clear(')'); - PATH_SEGMENT.clear('&'); } private static final int RADIX = 16; diff --git a/httpcore5/src/main/java/org/apache/hc/core5/net/URIBuilder.java b/httpcore5/src/main/java/org/apache/hc/core5/net/URIBuilder.java index a0160ec41e..3925c6ff76 100644 --- a/httpcore5/src/main/java/org/apache/hc/core5/net/URIBuilder.java +++ b/httpcore5/src/main/java/org/apache/hc/core5/net/URIBuilder.java @@ -34,6 +34,7 @@ import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; +import java.util.BitSet; import java.util.Collections; import java.util.LinkedList; import java.util.List; @@ -87,9 +88,36 @@ public static URIBuilder loopbackAddress() { private Charset charset; private String fragment; private String encodedFragment; + private EncodingPolicy encodingPolicy = EncodingPolicy.ALL_RESERVED; private boolean plusAsBlank; + /** + * Defines the encoding policy for URI components in {@link URIBuilder}. + * This enum controls how characters are percent-encoded when constructing a URI, + * allowing flexibility between strict encoding and RFC 3986-compliant behavior. + * + * @since 5.4 + */ + public enum EncodingPolicy { + /** + * Encodes all reserved characters, allowing only unreserved characters + * (ALPHA, DIGIT, "-", ".", "_", "~") to remain unencoded. This is a strict + * policy suitable for conservative URI production where maximum encoding + * is desired. + */ + ALL_RESERVED, + + /** + * Follows RFC 3986 component-specific encoding rules. For example, query and + * fragment components allow unreserved characters, sub-delimiters ("!", "$", + * "&", "'", "(", ")", "*", "+", ",", ";", "="), and additional characters + * (":", "@", "/", "?") to remain unencoded, as defined by {@code PercentCodec.FRAGMENT}. + * This policy ensures compliance with RFC 3986 while maintaining interoperability. + */ + RFC_3986 + } + /** * Constructs an empty instance. */ @@ -175,6 +203,22 @@ public URIBuilder setCharset(final Charset charset) { return this; } + /** + * Sets the encoding policy for this {@link URIBuilder}. + * The encoding policy determines how URI components (e.g., query, fragment) are + * percent-encoded when building the URI string. If not set, the default policy + * is {@link EncodingPolicy#RFC_3986}. + * + * @param encodingPolicy the encoding policy to apply, or {@code null} to reset + * to the default ({@link EncodingPolicy#ALL_RESERVED}) + * @return this {@link URIBuilder} instance for method chaining + * @since 5.4 + */ + public URIBuilder setEncodingPolicy(final EncodingPolicy encodingPolicy) { + this.encodingPolicy = encodingPolicy; + return this; + } + /** * Gets the authority. * @@ -300,33 +344,46 @@ static List parsePath(final CharSequence s, final Charset charset) { return list; } - static void formatPath(final StringBuilder buf, final Iterable segments, final boolean rootless, final Charset charset) { + static void formatPath(final StringBuilder buf, final Iterable segments, final boolean rootless, + final Charset charset, final BitSet safechars) { int i = 0; for (final String segment : segments) { if (i > 0 || !rootless) { buf.append(PATH_SEPARATOR); } - PercentCodec.encode(buf, segment, charset, PercentCodec.PATH_SEGMENT, false); + PercentCodec.encode(buf, segment, charset, safechars, false); i++; } } - static void formatQuery(final StringBuilder buf, final Iterable params, final Charset charset, - final boolean blankAsPlus) { + static void formatPath(final StringBuilder buf, final Iterable segments, final boolean rootless, + final Charset charset) { + formatPath(buf, segments, rootless, charset, PercentCodec.UNRESERVED); + } + + + static void formatQuery(final StringBuilder buf, final Iterable params, + final Charset charset, final BitSet safechars, final boolean blankAsPlus) { int i = 0; for (final NameValuePair parameter : params) { if (i > 0) { buf.append(QUERY_PARAM_SEPARATOR); } - PercentCodec.encode(buf, parameter.getName(), charset, blankAsPlus); + PercentCodec.encode(buf, parameter.getName(), charset, safechars, blankAsPlus); if (parameter.getValue() != null) { buf.append(PARAM_VALUE_SEPARATOR); - PercentCodec.encode(buf, parameter.getValue(), charset, blankAsPlus); + PercentCodec.encode(buf, parameter.getValue(), charset, safechars, blankAsPlus); } i++; } } + static void formatQuery(final StringBuilder buf, final Iterable params, + final Charset charset, final boolean blankAsPlus) { + formatQuery(buf, params, charset, PercentCodec.UNRESERVED, blankAsPlus); + } + + /** * Builds a {@link URI} instance. */ @@ -356,18 +413,22 @@ private String buildString() { } else if (this.userInfo != null) { final int idx = this.userInfo.indexOf(':'); if (idx != -1) { - PercentCodec.encode(sb, this.userInfo.substring(0, idx), this.charset, PercentCodec.USERINFO, false); + PercentCodec.encode(sb, this.userInfo.substring(0, idx), this.charset, + encodingPolicy == EncodingPolicy.ALL_RESERVED ? PercentCodec.UNRESERVED : PercentCodec.USERINFO, false); sb.append(':'); - PercentCodec.encode(sb, this.userInfo.substring(idx + 1), this.charset, PercentCodec.USERINFO, false); + PercentCodec.encode(sb, this.userInfo.substring(idx + 1), this.charset, + encodingPolicy == EncodingPolicy.ALL_RESERVED ? PercentCodec.UNRESERVED : PercentCodec.USERINFO, false); } else { - PercentCodec.encode(sb, this.userInfo, this.charset, PercentCodec.USERINFO, false); + PercentCodec.encode(sb, this.userInfo, this.charset, + encodingPolicy == EncodingPolicy.ALL_RESERVED ? PercentCodec.UNRESERVED : PercentCodec.USERINFO, false); } sb.append("@"); } if (InetAddressUtils.isIPv6(this.host)) { sb.append("[").append(this.host).append("]"); } else { - PercentCodec.encode(sb, this.host, this.charset, PercentCodec.REG_NAME, false); + PercentCodec.encode(sb, this.host, this.charset, + encodingPolicy == EncodingPolicy.ALL_RESERVED ? PercentCodec.UNRESERVED : PercentCodec.REG_NAME, false); } if (this.port >= 0) { sb.append(":").append(this.port); @@ -382,23 +443,27 @@ private String buildString() { } sb.append(this.encodedPath); } else if (this.pathSegments != null) { - formatPath(sb, this.pathSegments, !authoritySpecified && this.pathRootless, this.charset); + formatPath(sb, this.pathSegments, !authoritySpecified && this.pathRootless, this.charset, + encodingPolicy == EncodingPolicy.ALL_RESERVED ? PercentCodec.UNRESERVED : PercentCodec.PATH_SEGMENT); } if (this.encodedQuery != null) { sb.append("?").append(this.encodedQuery); } else if (this.queryParams != null && !this.queryParams.isEmpty()) { sb.append("?"); - formatQuery(sb, this.queryParams, this.charset, false); + formatQuery(sb, this.queryParams, this.charset, + encodingPolicy == EncodingPolicy.ALL_RESERVED ? PercentCodec.UNRESERVED : PercentCodec.QUERY, false); } else if (this.query != null) { sb.append("?"); - PercentCodec.encode(sb, this.query, this.charset, PercentCodec.QUERY, false); + PercentCodec.encode(sb, this.query, this.charset, + encodingPolicy == EncodingPolicy.ALL_RESERVED ? PercentCodec.URIC : PercentCodec.QUERY, false); } } if (this.encodedFragment != null) { sb.append("#").append(this.encodedFragment); } else if (this.fragment != null) { sb.append("#"); - PercentCodec.encode(sb, this.fragment, this.charset, PercentCodec.FRAGMENT, false); + PercentCodec.encode(sb, this.fragment, this.charset, + encodingPolicy == EncodingPolicy.ALL_RESERVED ? PercentCodec.URIC : PercentCodec.FRAGMENT, false); } return sb.toString(); } diff --git a/httpcore5/src/test/java/org/apache/hc/core5/net/TestURIBuilder.java b/httpcore5/src/test/java/org/apache/hc/core5/net/TestURIBuilder.java index 5e6614f25d..74e7a6016c 100644 --- a/httpcore5/src/test/java/org/apache/hc/core5/net/TestURIBuilder.java +++ b/httpcore5/src/test/java/org/apache/hc/core5/net/TestURIBuilder.java @@ -171,7 +171,7 @@ void testFormatQuery() { @Test void testHierarchicalUri() throws Exception { final URI uri = new URI("http", "stuff", "localhost", 80, "/some stuff", "param=stuff", "fragment"); - final URIBuilder uribuilder = new URIBuilder(uri); + final URIBuilder uribuilder = new URIBuilder(uri).setEncodingPolicy(URIBuilder.EncodingPolicy.ALL_RESERVED); final URI result = uribuilder.build(); Assertions.assertEquals(new URI("http://stuff@localhost:80/some%20stuff?param=stuff#fragment"), result); } @@ -998,20 +998,6 @@ void testSetPlusAsBlank() throws Exception { Assertions.assertEquals("hello world", params.get(0).getValue()); } - @Test - void testFragmentEncoding() throws Exception { - final String fragment = "frag ment:!@/?\""; - final String expectedEncodedFragment = "frag%20ment:!@/?%22"; - - final URI uri = new URIBuilder() - .setScheme("http") - .setHost("example.com") - .setFragment(fragment) - .build(); - - Assertions.assertEquals(expectedEncodedFragment, uri.getRawFragment()); - } - @Test void testCustomQueryEncoding() throws Exception { final String query = "query param:!@/?\""; @@ -1021,6 +1007,7 @@ void testCustomQueryEncoding() throws Exception { .setScheme("http") .setHost("example.com") .setCustomQuery(query) + .setEncodingPolicy(URIBuilder.EncodingPolicy.RFC_3986) .build(); Assertions.assertEquals(expectedEncodedQuery, uri.getRawQuery());