diff --git a/application/src/main/java/org/togetherjava/tjbot/features/utils/LinkDetection.java b/application/src/main/java/org/togetherjava/tjbot/features/utils/LinkDetection.java index 3b6dc18112..7c560602da 100644 --- a/application/src/main/java/org/togetherjava/tjbot/features/utils/LinkDetection.java +++ b/application/src/main/java/org/togetherjava/tjbot/features/utils/LinkDetection.java @@ -4,27 +4,69 @@ import com.linkedin.urls.detection.UrlDetector; import com.linkedin.urls.detection.UrlDetectorOptions; +import java.net.URI; +import java.net.http.HttpClient; +import java.net.http.HttpRequest; +import java.net.http.HttpResponse; import java.util.List; +import java.util.Objects; import java.util.Optional; import java.util.Set; +import java.util.concurrent.CompletableFuture; /** - * Utility class to detect links. + * Utility methods for working with links inside arbitrary text. + * + *

+ * This class can: + *

+ * + *

+ * It is intentionally stateless and uses asynchronous HTTP requests to avoid blocking calling + * threads. */ + public class LinkDetection { + private static final HttpClient HTTP_CLIENT = HttpClient.newHttpClient(); /** - * Possible ways to filter a link. + * Default filters applied when extracting links from text. + * + *

+ * These filters intentionally ignore: + *

* - * @see LinkDetection + *

+ * This reduces false positives when scanning chat messages or source-code snippets. + */ + + private static final Set DEFAULT_FILTERS = + Set.of(LinkFilter.SUPPRESSED, LinkFilter.NON_HTTP_SCHEME); + + /** + * Filters that control which detected URLs are returned by {@link #extractLinks}. */ public enum LinkFilter { /** - * Filters links suppressed with {@literal }. + * Ignores URLs that are wrapped in angle brackets, e.g. {@code }. + * + *

+ * Such links are often intentionally suppressed in chat platforms. */ SUPPRESSED, /** - * Filters links that are not using http scheme. + * Ignores URLs that do not use the HTTP or HTTPS scheme. + * + *

+ * This helps avoid false positives such as {@code ftp://}, {@code file://}, or scheme-less + * matches. */ NON_HTTP_SCHEME } @@ -34,12 +76,26 @@ private LinkDetection() { } /** - * Extracts all links from the given content. + * Extracts HTTP(S) links from the given text. * - * @param content the content to search through - * @param filter the filters applied to the urls - * @return a list of all found links, can be empty + *

+ * The text is scanned using a URL detector, then filtered and normalized according to the + * provided {@link LinkFilter}s. + * + *

+ * Example: + * + *

{@code
+     * Set<LinkFilter> filters = Set.of(LinkFilter.SUPPRESSED, LinkFilter.NON_HTTP_SCHEME);
+     * extractLinks("Visit https://example.com and <https://hidden.example>", filters)
+     * // returns ["https://example.com"]
+     * }
+ * + * @param content the text to scan for links + * @param filter a set of filters controlling which detected links are returned + * @return a list of extracted links in the order they appear in the text */ + public static List extractLinks(String content, Set filter) { return new UrlDetector(content, UrlDetectorOptions.BRACKET_MATCH).detect() .stream() @@ -49,15 +105,157 @@ public static List extractLinks(String content, Set filter) } /** - * Checks whether the given content contains a link. + * Checks whether the given text contains at least one detectable URL. * - * @param content the content to search through - * @return true if the content contains at least one link + *

+ * This method performs a lightweight detection only and does not apply any {@link LinkFilter}s. + * + * @param content the text to scan + * @return {@code true} if at least one URL-like pattern is detected */ +
    public static boolean containsLink(String content) {
        // Run the detector over the raw text; no LinkFilter is consulted here
        UrlDetector detector = new UrlDetector(content, UrlDetectorOptions.BRACKET_MATCH);
        boolean nothingDetected = detector.detect().isEmpty();
        return !nothingDetected;
    }
+ /** + * Asynchronously checks whether a URL is considered broken. + * + *

+ * The check is performed in two steps: + *

    + *
  1. A {@code HEAD} request is sent first (cheap and fast)
  2. + *
  3. If that fails or returns an error, a {@code GET} request is used as a fallback
  4. + *
+ * + *

+ * A link is considered broken if: + *

    + *
  • The URL is malformed or unreachable
  • + *
  • The HTTP request fails with an exception
  • + *
  • The response status code is 4xx (client error) or 5xx (server error)
  • + *
+ * + *

+ * Successful responses (2xx) and redirects (3xx) are considered valid links. The response body + * is never inspected. + * + * @param url the URL to check + * @return a {@code CompletableFuture} completing with {@code true} if the link is broken, + * {@code false} otherwise + */ + + public static CompletableFuture isLinkBroken(String url) { + HttpRequest headRequest = HttpRequest.newBuilder(URI.create(url)) + .method("HEAD", HttpRequest.BodyPublishers.noBody()) + .build(); + + return HTTP_CLIENT.sendAsync(headRequest, HttpResponse.BodyHandlers.discarding()) + .thenApply(response -> { + int status = response.statusCode(); + // 2xx and 3xx are success, 4xx and 5xx are errors + return status >= 400; + }) + .exceptionally(ignored -> true) + .thenCompose(result -> { + if (!Boolean.TRUE.equals(result)) { + return CompletableFuture.completedFuture(false); + } + HttpRequest fallbackGetRequest = + HttpRequest.newBuilder(URI.create(url)).GET().build(); + return HTTP_CLIENT + .sendAsync(fallbackGetRequest, HttpResponse.BodyHandlers.discarding()) + .thenApply(resp -> resp.statusCode() >= 400) + .exceptionally(ignored -> true); + }); + } + + /** + * Replaces all broken HTTP(S) links in the given text. + * + *

+ * Each detected link is checked asynchronously using {@link #isLinkBroken(String)}. Only links + * confirmed as broken are replaced. Duplicate URLs are checked only once and all occurrences + * are replaced if found to be broken. + * + *

+ * This method does not block - all link checks are performed asynchronously and combined into a + * single {@code CompletableFuture}. + * + *

+ * Example: + * + *

{@code
+     * replaceDeadLinks("""
+     *           Test
+     *           http://deadlink/1
+     *           http://workinglink/1
+     *         """, "(broken link)")
+     * }
+ * + *

+ * Results in: + * + *

{@code
+     * Test
+     * (broken link)
+     * http://workinglink/1
+     * }
+ * + * @param text the input text containing URLs + * @param replacement the string used to replace broken links + * @return a {@code CompletableFuture} that completes with the modified text, or the original + * text if no broken links were found + */ + + + public static CompletableFuture replaceDeadLinks(String text, String replacement) { + List links = extractLinks(text, DEFAULT_FILTERS); + + if (links.isEmpty()) { + return CompletableFuture.completedFuture(text); + } + + List> deadLinkFutures = links.stream() + .distinct() + .map(link -> isLinkBroken(link) + .thenApply(isBroken -> Boolean.TRUE.equals(isBroken) ? link : null)) + + .toList(); + + return CompletableFuture.allOf(deadLinkFutures.toArray(new CompletableFuture[0])) + .thenApply(ignored -> deadLinkFutures.stream() + .map(CompletableFuture::join) + .filter(Objects::nonNull) + .toList()) + .thenApply(deadLinks -> { + String result = text; + for (String deadLink : deadLinks) { + result = result.replace(deadLink, replacement); + } + return result; + }); + } + + /** + * Converts a detected {@link Url} into a normalized link string. + * + *

+ * Applies the provided {@link LinkFilter}s: + *

    + *
  • {@link LinkFilter#SUPPRESSED} - filters URLs wrapped in angle brackets
  • + *
  • {@link LinkFilter#NON_HTTP_SCHEME} - filters non-HTTP(S) schemes
  • + *
+ * + *

+ * Additionally removes trailing punctuation such as commas or periods from the detected URL. + * + * @param url the detected URL + * @param filter active link filters to apply + * @return an {@link Optional} containing the normalized link, or {@code Optional.empty()} if + * the link should be filtered out + */ + private static Optional toLink(Url url, Set filter) { String raw = url.getOriginalUrl(); if (filter.contains(LinkFilter.SUPPRESSED) && raw.contains(">")) { @@ -76,8 +274,6 @@ private static Optional toLink(Url url, Set filter) { // Remove trailing punctuation link = link.substring(0, link.length() - 1); } - return Optional.of(link); } - }