diff --git a/robots.cc b/robots.cc
index bdbccea..12caaff 100644
--- a/robots.cc
+++ b/robots.cc
@@ -21,7 +21,7 @@
 // with Google-specific optimizations detailed at
 //   https://developers.google.com/search/reference/robots_txt
 
-#include "robots.h"
+#include "./robots.h"
 
 #include <stdlib.h>
 
diff --git a/robots.h b/robots.h
index adccef5..f22038b 100644
--- a/robots.h
+++ b/robots.h
@@ -112,7 +112,9 @@ class RobotsMatcher : protected RobotsParseHandler {
   static bool IsValidUserAgentToObey(absl::string_view user_agent);
 
   // Returns true iff 'url' is allowed to be fetched by any member of the
-  // "user_agents" vector. 'url' must be %-encoded according to RFC3986.
+  // "user_agents" vector, after collapsing the rules that apply to any of
+  // those user agents into a single ruleset. 'url' must be %-encoded
+  // according to RFC3986.
   bool AllowedByRobots(absl::string_view robots_body,
                        const std::vector<std::string>* user_agents,
                        const std::string& url);
diff --git a/robots_main.cc b/robots_main.cc
index 0e60f02..a96fffa 100644
--- a/robots_main.cc
+++ b/robots_main.cc
@@ -16,26 +16,30 @@
 // File: robots_main.cc
 // -----------------------------------------------------------------------------
 //
-// Simple binary to assess whether a URL is accessible to a user-agent according
-// to records found in a local robots.txt file, based on Google's robots.txt
-// parsing and matching algorithms.
+// Simple binary to assess whether a URL is accessible to a set of user-agents
+// according to records found in a local robots.txt file, based on Google's
+// robots.txt parsing and matching algorithms.
 // Usage:
 //     robots_main <local_path_to_robotstxt> <user_agent> <url>
 // Arguments:
 // local_path_to_robotstxt: local path to a file containing robots.txt records.
 //   For example: /home/users/username/robots.txt
-// user_agent: a token to be matched against records in the robots.txt.
-//   For example: Googlebot
+// user_agent: a token to be matched against records in the robots.txt (or a
+//   comma-separated list of such tokens).
+//   For example: Googlebot or Googlebot,Googlebot-image
 // url: a url to be matched against records in the robots.txt. The URL must be
 //   %-encoded according to RFC3986.
 //   For example: https://example.com/accessible/url.html
 // Returns: Prints a sentence with verdict about whether 'user_agent' is allowed
-// to access 'url' based on records in 'local_path_to_robotstxt'.
+// to access 'url' based on records in 'local_path_to_robotstxt'. When multiple
+// user agents are provided, they are checked together as a vector via the
+// "AllowedByRobots" method.
 //
 #include <fstream>
 #include <iostream>
+#include <sstream>
 
-#include "robots.h"
+#include "./robots.h"
 
 bool LoadFile(const std::string& filename, std::string* result) {
   std::ifstream file(filename, std::ios::in | std::ios::binary | std::ios::ate);
@@ -86,13 +90,18 @@ int main(int argc, char** argv) {
     return 1;
   }
 
-  std::string user_agent = argv[2];
-  std::vector<std::string> user_agents(1, user_agent);
+  std::string input_useragents = argv[2];
+  std::vector<std::string> useragents;
+  std::string ua;
+  std::istringstream ss(input_useragents);
+  while (std::getline(ss, ua, ',')) {
+    useragents.push_back(ua);
+  }
   googlebot::RobotsMatcher matcher;
   std::string url = argv[3];
-  bool allowed = matcher.AllowedByRobots(robots_content, &user_agents, url);
+  bool allowed = matcher.AllowedByRobots(robots_content, &useragents, url);
 
-  std::cout << "user-agent '" << user_agent << "' with URI '" << argv[3]
+  std::cout << "user-agent '" << input_useragents << "' with URI '" << argv[3]
             << "': " << (allowed ? "ALLOWED" : "DISALLOWED") << std::endl;
   if (robots_content.empty()) {
     std::cout << "notice: robots file is empty so all user-agents are allowed"
diff --git a/robots_test.cc b/robots_test.cc
index 369efcd..7e64e1f 100644
--- a/robots_test.cc
+++ b/robots_test.cc
@@ -15,7 +15,7 @@
 // This file tests the robots.txt parsing and matching code found in robots.cc
 // against the current Robots Exclusion Protocol (REP) internet draft (I-D).
 // https://tools.ietf.org/html/draft-koster-rep
-#include "robots.h"
+#include "./robots.h"
 
 #include <string>
 
@@ -33,6 +33,19 @@ bool IsUserAgentAllowed(const absl::string_view robotstxt,
   return matcher.OneAgentAllowedByRobots(robotstxt, useragent, url);
 }
 
+bool AllowedByRobots(const absl::string_view robotstxt,
+                     const std::string& input_useragents,
+                     const std::string& url) {
+  std::vector<std::string> useragents;
+  std::string ua;
+  std::istringstream ss(input_useragents);
+  while (std::getline(ss, ua, ',')) {
+    useragents.push_back(ua);
+  }
+  RobotsMatcher matcher;
+  return matcher.AllowedByRobots(robotstxt, &useragents, url);
+}
+
 // Google-specific: system test.
 TEST(RobotsUnittest, GoogleOnly_SystemTest) {
   const absl::string_view robotstxt =
@@ -123,6 +136,34 @@ TEST(RobotsUnittest, ID_LineSyntax_Groups) {
   EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BazBot", url_foo));
 }
 
+// Test based on the documentation at
+// https://developers.google.com/search/reference/robots_txt#order-of-precedence-for-user-agents
+// "Only one group is valid for a particular crawler"
+// "The group followed is group 1. Only the most specific group is followed,
+// all others are ignored"
+TEST(RobotsUnittest, ID_Multiple_Useragents) {
+  const absl::string_view robotstxt =
+      "user-agent: googlebot-news\n"
+      "Disallow: /bar/\n"
+      "\n"
+      "user-agent: *\n"
+      "Disallow: /baz/\n"
+      "\n\n"
+      "user-agent: googlebot\n"
+      "Disallow: /foo/\n";
+
+  const std::string url_foo = "http://foo.bar/foo/";
+  const std::string url_bar = "http://foo.bar/bar/";
+  const std::string url_baz = "http://foo.bar/baz/";
+  const std::string url_qux = "http://foo.bar/qux/";
+
+  // Note: the first expectation currently fails.
+  EXPECT_TRUE(AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_foo));
+  EXPECT_FALSE(AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_bar));
+  EXPECT_TRUE(AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_baz));
+  EXPECT_TRUE(AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_qux));
+}
+
 // REP lines are case insensitive. See REP I-D section "Protocol Definition".
 // https://tools.ietf.org/html/draft-koster-rep#section-2.1
 TEST(RobotsUnittest, ID_REPLineNamesCaseInsensitive) {