From 51cc0d4e81e21ab7649392e7096fb45c52e9cb1e Mon Sep 17 00:00:00 2001
From: Will Critchlow
Date: Fri, 8 Nov 2019 12:55:52 +0000
Subject: [PATCH 1/4] Allow multiple user agents

Take a comma-delimited list of user agents and pass them as a vector.
---
 robots_main.cc | 14 ++++++++++----
 robots_test.cc | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/robots_main.cc b/robots_main.cc
index 0e60f02..ac16321 100644
--- a/robots_main.cc
+++ b/robots_main.cc
@@ -34,6 +34,7 @@
 //
 #include <fstream>
 #include <iostream>
+#include <sstream>
 
 #include "robots.h"
 
@@ -86,13 +87,18 @@ int main(int argc, char** argv) {
     return 1;
   }
 
-  std::string user_agent = argv[2];
-  std::vector<std::string> user_agents(1, user_agent);
+  std::string input_useragents = argv[2];
+  std::vector<std::string> useragents;
+  std::string ua;
+  std::istringstream ss(input_useragents);
+  while(std::getline(ss, ua, ',')) {
+    useragents.push_back(ua);
+  }
   googlebot::RobotsMatcher matcher;
   std::string url = argv[3];
-  bool allowed = matcher.AllowedByRobots(robots_content, &user_agents, url);
+  bool allowed = matcher.AllowedByRobots(robots_content, &useragents, url);
 
-  std::cout << "user-agent '" << user_agent << "' with URI '" << argv[3]
+  std::cout << "user-agent '" << input_useragents << "' with URI '" << argv[3]
             << "': " << (allowed ? "ALLOWED" : "DISALLOWED") << std::endl;
   if (robots_content.empty()) {
     std::cout << "notice: robots file is empty so all user-agents are allowed"
diff --git a/robots_test.cc b/robots_test.cc
index 369efcd..ea5a2d3 100644
--- a/robots_test.cc
+++ b/robots_test.cc
@@ -33,6 +33,18 @@ bool IsUserAgentAllowed(const absl::string_view robotstxt,
   return matcher.OneAgentAllowedByRobots(robotstxt, useragent, url);
 }
 
+bool AllowedByRobots(const absl::string_view robotstxt,
+                     const std::string& input_useragents, const std::string& url) {
+  std::vector<std::string> useragents;
+  std::string ua;
+  std::istringstream ss(input_useragents);
+  while(std::getline(ss, ua, ',')) {
+    useragents.push_back(ua);
+  }
+  RobotsMatcher matcher;
+  return matcher.AllowedByRobots(robotstxt, &useragents, url);
+}
+
 // Google-specific: system test.
 TEST(RobotsUnittest, GoogleOnly_SystemTest) {
   const absl::string_view robotstxt =
@@ -123,6 +135,32 @@ TEST(RobotsUnittest, ID_LineSyntax_Groups) {
   EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BazBot", url_foo));
 }
 
+// Test based on the documentation at
+// https://developers.google.com/search/reference/robots_txt#order-of-precedence-for-user-agents
+// "Only one group is valid for a particular crawler"
+// "The group followed is group 1. Only the most specific group is followed, all others are ignored"
+TEST(RobotsUnittest, ID_Multiple_Useragents) {
+  const absl::string_view robotstxt =
+      "user-agent: googlebot-news\n"
+      "Disallow: /bar/\n"
+      "\n"
+      "user-agent: *\n"
+      "Disallow: /baz/\n"
+      "\n\n"
+      "user-agent: googlebot\n"
+      "Disallow: /foo/\n";
+
+  const std::string url_foo = "http://foo.bar/foo/";
+  const std::string url_bar = "http://foo.bar/bar/";
+  const std::string url_baz = "http://foo.bar/baz/";
+  const std::string url_qux = "http://foo.bar/qux/";
+
+  EXPECT_TRUE(AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_foo));  // this currently fails
+  EXPECT_FALSE(AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_bar));
+  EXPECT_TRUE(AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_baz));
+  EXPECT_TRUE(AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_qux));
+}
+
 // REP lines are case insensitive. See REP I-D section "Protocol Definition".
 // https://tools.ietf.org/html/draft-koster-rep
 TEST(RobotsUnittest, ID_REPLineNamesCaseInsensitive) {
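
A note on the parsing added in PATCH 1/4: the new loop in robots_main.cc splits
argv[2] with std::getline over a std::istringstream, so each comma-separated
field is kept exactly as typed (no whitespace trimming, no de-duplication, and
an empty field yields an empty string). The standalone sketch below reproduces
that behaviour outside the binary; the helper name SplitUserAgents and the demo
main are illustrative only and are not part of the patch.

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Mirrors the splitting loop added to robots_main.cc: every field between
// commas becomes one entry, stored verbatim.
std::vector<std::string> SplitUserAgents(const std::string& input_useragents) {
  std::vector<std::string> useragents;
  std::string ua;
  std::istringstream ss(input_useragents);
  while (std::getline(ss, ua, ',')) {
    useragents.push_back(ua);
  }
  return useragents;
}

int main() {
  // With a space after the comma the second token keeps its leading space:
  // prints "[Googlebot]" and "[ Googlebot-Image]".
  for (const std::string& ua : SplitUserAgents("Googlebot, Googlebot-Image")) {
    std::cout << "[" << ua << "]\n";
  }
  return 0;
}
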

From 9d757576bfdfa054282c505ba32f7cf933fad649 Mon Sep 17 00:00:00 2001
From: Will Critchlow
Date: Mon, 11 Nov 2019 16:47:27 +0000
Subject: [PATCH 2/4] Modify comment to match behaviour

My best understanding of what the code actually does with a vector of user
agents is that it treats the vector as if all the rules applying to any of
the user agents were collapsed into a single ruleset applying to all of them.
---
 robots.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/robots.h b/robots.h
index adccef5..fa4b7d9 100644
--- a/robots.h
+++ b/robots.h
@@ -112,7 +112,9 @@ class RobotsMatcher : protected RobotsParseHandler {
   static bool IsValidUserAgentToObey(absl::string_view user_agent);
 
   // Returns true iff 'url' is allowed to be fetched by any member of the
-  // "user_agents" vector. 'url' must be %-encoded according to RFC3986.
+  // "user_agents" vector after collapsing all rules applying to any member of 
+  // the "user_agents" vector into a single ruleset. 'url' must be %-encoded 
+  // according to RFC3986.
   bool AllowedByRobots(absl::string_view robots_body,
                        const std::vector<std::string>* user_agents,
                        const std::string& url);

From 74d6d76277bf706f77673d7a3a5588c66b30c1bc Mon Sep 17 00:00:00 2001
From: Will Critchlow
Date: Mon, 11 Nov 2019 17:02:35 +0000
Subject: [PATCH 3/4] Update comment explaining how file works

Update the comments to cover the possibility of passing in a vector of user
agents.
---
 robots_main.cc | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/robots_main.cc b/robots_main.cc
index ac16321..04dde56 100644
--- a/robots_main.cc
+++ b/robots_main.cc
@@ -16,21 +16,24 @@
 // File: robots_main.cc
 // -----------------------------------------------------------------------------
 //
-// Simple binary to assess whether a URL is accessible to a user-agent according
-// to records found in a local robots.txt file, based on Google's robots.txt
-// parsing and matching algorithms.
+// Simple binary to assess whether a URL is accessible to a set of user-agents 
+// according to records found in a local robots.txt file, based on Google's 
+// robots.txt parsing and matching algorithms.
 // Usage:
 //     robots_main <local_path_to_robotstxt> <user_agent> <url>
 // Arguments:
 // local_path_to_robotstxt: local path to a file containing robots.txt records.
 //   For example: /home/users/username/robots.txt
-// user_agent: a token to be matched against records in the robots.txt.
-//   For example: Googlebot
+// user_agent: a token to be matched against records in the robots.txt (or a
+//   comma-separated list of user agents)
+//   For example: Googlebot or Googlebot,Googlebot-image
 // url: a url to be matched against records in the robots.txt. The URL must be
 //   %-encoded according to RFC3986.
 //   For example: https://example.com/accessible/url.html
 // Returns: Prints a sentence with verdict about whether 'user_agent' is allowed
-//         to access 'url' based on records in 'local_path_to_robotstxt'.
+//         to access 'url' based on records in 'local_path_to_robotstxt'. When multiple
+//         user agents are provided, check them as a vector based on functionality in
+//         "AllowedByRobots" method.
 //
 #include <fstream>
 #include <iostream>
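
Before the final tidy-up patch, it may help to see the vector overload driven
directly. The sketch below is illustrative only (the standalone main and the
hard-coded strings are not part of the patches): it calls
googlebot::RobotsMatcher::AllowedByRobots the same way robots_main.cc now does,
using the robots.txt and user-agent pair from the new ID_Multiple_Useragents
test. Under the collapsed-ruleset reading described in PATCH 2/4, the googlebot
group's "Disallow: /foo/" also applies when googlebot-news is in the list,
which would explain why the first expectation in that test currently fails.

#include <iostream>
#include <string>
#include <vector>

#include "robots.h"  // googlebot::RobotsMatcher, as declared in the patches above

// Illustrative driver, not part of the patch series.
int main() {
  const std::string robotstxt =
      "user-agent: googlebot-news\n"
      "Disallow: /bar/\n"
      "\n"
      "user-agent: *\n"
      "Disallow: /baz/\n"
      "\n\n"
      "user-agent: googlebot\n"
      "Disallow: /foo/\n";

  // Same pair the new test passes as "googlebot,googlebot-news".
  std::vector<std::string> useragents = {"googlebot", "googlebot-news"};

  const std::vector<std::string> urls = {
      "http://foo.bar/foo/", "http://foo.bar/bar/", "http://foo.bar/baz/"};
  for (const std::string& url : urls) {
    googlebot::RobotsMatcher matcher;
    const bool allowed = matcher.AllowedByRobots(robotstxt, &useragents, url);
    std::cout << url << ": " << (allowed ? "ALLOWED" : "DISALLOWED") << std::endl;
  }
  return 0;
}
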

From 3dfb0d2ba70d265977af7a7ff42808a34267b832 Mon Sep 17 00:00:00 2001
From: Will Critchlow
Date: Tue, 12 Nov 2019 13:24:06 +0000
Subject: [PATCH 4/4] Follow cpplint directives

Tidy up the code to Google C++ standards.
---
 robots.cc      |  2 +-
 robots.h       |  4 ++--
 robots_main.cc | 12 ++++++------
 robots_test.cc | 13 ++++++++-----
 4 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/robots.cc b/robots.cc
index bdbccea..12caaff 100644
--- a/robots.cc
+++ b/robots.cc
@@ -21,7 +21,7 @@
 // with Google-specific optimizations detailed at
 // https://developers.google.com/search/reference/robots_txt
 
-#include "robots.h"
+#include "./robots.h"
 
 #include
diff --git a/robots.h b/robots.h
index fa4b7d9..f22038b 100644
--- a/robots.h
+++ b/robots.h
@@ -112,8 +112,8 @@ class RobotsMatcher : protected RobotsParseHandler {
   static bool IsValidUserAgentToObey(absl::string_view user_agent);
 
   // Returns true iff 'url' is allowed to be fetched by any member of the
-  // "user_agents" vector after collapsing all rules applying to any member of 
-  // the "user_agents" vector into a single ruleset. 'url' must be %-encoded 
+  // "user_agents" vector after collapsing all rules applying to any member of
+  // the "user_agents" vector into a single ruleset. 'url' must be %-encoded
   // according to RFC3986.
   bool AllowedByRobots(absl::string_view robots_body,
                        const std::vector<std::string>* user_agents,
diff --git a/robots_main.cc b/robots_main.cc
index 04dde56..a96fffa 100644
--- a/robots_main.cc
+++ b/robots_main.cc
@@ -16,8 +16,8 @@
 // File: robots_main.cc
 // -----------------------------------------------------------------------------
 //
-// Simple binary to assess whether a URL is accessible to a set of user-agents 
-// according to records found in a local robots.txt file, based on Google's 
+// Simple binary to assess whether a URL is accessible to a set of user-agents
+// according to records found in a local robots.txt file, based on Google's
 // robots.txt parsing and matching algorithms.
 // Usage:
 //     robots_main <local_path_to_robotstxt> <user_agent> <url>
@@ -31,15 +31,15 @@
 //   %-encoded according to RFC3986.
 //   For example: https://example.com/accessible/url.html
 // Returns: Prints a sentence with verdict about whether 'user_agent' is allowed
-//         to access 'url' based on records in 'local_path_to_robotstxt'. When multiple
-//         user agents are provided, check them as a vector based on functionality in
-//         "AllowedByRobots" method.
+//         to access 'url' based on records in 'local_path_to_robotstxt'. When
+//         multiple user agents are provided, check them as a vector based on
+//         functionality in "AllowedByRobots" method.
 //
 #include <fstream>
 #include <iostream>
 #include <sstream>
 
-#include "robots.h"
+#include "./robots.h"
 
 bool LoadFile(const std::string& filename, std::string* result) {
   std::ifstream file(filename, std::ios::in | std::ios::binary | std::ios::ate);
@@ -94,7 +94,7 @@ int main(int argc, char** argv) {
   std::vector<std::string> useragents;
   std::string ua;
   std::istringstream ss(input_useragents);
-  while(std::getline(ss, ua, ',')) {
+  while (std::getline(ss, ua, ',')) {
     useragents.push_back(ua);
   }
   googlebot::RobotsMatcher matcher;
diff --git a/robots_test.cc b/robots_test.cc
index ea5a2d3..7e64e1f 100644
--- a/robots_test.cc
+++ b/robots_test.cc
@@ -15,7 +15,7 @@
 // This file tests the robots.txt parsing and matching code found in robots.cc
 // against the current Robots Exclusion Protocol (REP) internet draft (I-D).
 // https://tools.ietf.org/html/draft-koster-rep
-#include "robots.h"
+#include "./robots.h"
 
 #include
@@ -34,11 +34,12 @@ bool IsUserAgentAllowed(const absl::string_view robotstxt,
 }
 
 bool AllowedByRobots(const absl::string_view robotstxt,
-                     const std::string& input_useragents, const std::string& url) {
+                     const std::string& input_useragents,
+                     const std::string& url) {
   std::vector<std::string> useragents;
   std::string ua;
   std::istringstream ss(input_useragents);
-  while(std::getline(ss, ua, ',')) {
+  while (std::getline(ss, ua, ',')) {
     useragents.push_back(ua);
   }
   RobotsMatcher matcher;
@@ -138,7 +139,8 @@ TEST(RobotsUnittest, ID_LineSyntax_Groups) {
   EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BazBot", url_foo));
 }
 
 // Test based on the documentation at
 // https://developers.google.com/search/reference/robots_txt#order-of-precedence-for-user-agents
 // "Only one group is valid for a particular crawler"
-// "The group followed is group 1. Only the most specific group is followed, all others are ignored"
+// "The group followed is group 1. Only the most specific group is followed,
+// all others are ignored"
 TEST(RobotsUnittest, ID_Multiple_Useragents) {
   const absl::string_view robotstxt =
       "user-agent: googlebot-news\n"
@@ -155,7 +157,8 @@ TEST(RobotsUnittest, ID_Multiple_Useragents) {
   const std::string url_baz = "http://foo.bar/baz/";
   const std::string url_qux = "http://foo.bar/qux/";
 
-  EXPECT_TRUE(AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_foo));  // this currently fails
+  // the first test currently fails
+  EXPECT_TRUE(AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_foo));
   EXPECT_FALSE(AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_bar));
   EXPECT_TRUE(AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_baz));
   EXPECT_TRUE(AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_qux));
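
With all four patches applied, the second argument of the sample binary accepts
either a single token or a comma-separated list, as documented in the updated
header comment of robots_main.cc. A hypothetical invocation, reusing the example
paths from that comment (the actual verdict depends on the robots.txt contents),
would be:

robots_main /home/users/username/robots.txt Googlebot,Googlebot-image https://example.com/accessible/url.html

which prints the verdict line built in main(), e.g. "user-agent
'Googlebot,Googlebot-image' with URI 'https://example.com/accessible/url.html':
ALLOWED" (or DISALLOWED), now reporting the whole comma-separated list rather
than a single user agent.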