2 changes: 1 addition & 1 deletion robots.cc
@@ -21,7 +21,7 @@
// with Google-specific optimizations detailed at
// https://developers.google.com/search/reference/robots_txt

#include "robots.h"
#include "./robots.h"

#include <stdlib.h>

4 changes: 3 additions & 1 deletion robots.h
@@ -112,7 +112,9 @@ class RobotsMatcher : protected RobotsParseHandler {
static bool IsValidUserAgentToObey(absl::string_view user_agent);

// Returns true iff 'url' is allowed to be fetched by any member of the
// "user_agents" vector. 'url' must be %-encoded according to RFC3986.
// "user_agents" vector after collapsing all rules applying to any member of
// the "user_agents" vector into a single ruleset. 'url' must be %-encoded
// according to RFC3986.
bool AllowedByRobots(absl::string_view robots_body,
const std::vector<std::string>* user_agents,
const std::string& url);
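The collapsed-ruleset behaviour described in the updated comment can be sketched from the caller's side. This is an illustration, not code from the PR: the helper name is hypothetical, the agent tokens are taken from the usage example in robots_main.cc below, and the include path and googlebot namespace follow the files in this change.

#include <string>
#include <vector>

#include "./robots.h"

// Verdict for the merged rules of both agents: every group that applies to
// either token contributes its rules before 'url' is matched.
bool AllowedForBothAgents(const std::string& robots_body,
                          const std::string& url) {
  std::vector<std::string> user_agents = {"Googlebot", "Googlebot-image"};
  googlebot::RobotsMatcher matcher;
  return matcher.AllowedByRobots(robots_body, &user_agents, url);
}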
31 changes: 20 additions & 11 deletions robots_main.cc
@@ -16,26 +16,30 @@
// File: robots_main.cc
// -----------------------------------------------------------------------------
//
// Simple binary to assess whether a URL is accessible to a user-agent according
// to records found in a local robots.txt file, based on Google's robots.txt
// parsing and matching algorithms.
// Simple binary to assess whether a URL is accessible to a set of user-agents
// according to records found in a local robots.txt file, based on Google's
// robots.txt parsing and matching algorithms.
// Usage:
// robots_main <local_path_to_robotstxt> <user_agent> <url>
// Arguments:
// local_path_to_robotstxt: local path to a file containing robots.txt records.
// For example: /home/users/username/robots.txt
// user_agent: a token to be matched against records in the robots.txt.
// For example: Googlebot
// user_agent: a token to be matched against records in the robots.txt (or a
// comma-separated list of user agents).
// For example: Googlebot or Googlebot,Googlebot-image
// url: a url to be matched against records in the robots.txt. The URL must be
// %-encoded according to RFC3986.
// For example: https://example.com/accessible/url.html
// Returns: Prints a sentence with verdict about whether 'user_agent' is allowed
// to access 'url' based on records in 'local_path_to_robotstxt'.
// to access 'url' based on records in 'local_path_to_robotstxt'. When multiple
// user agents are provided, they are checked together as a vector via the
// "AllowedByRobots" method.
//
#include <fstream>
#include <iostream>
#include <sstream>

#include "robots.h"
#include "./robots.h"

bool LoadFile(const std::string& filename, std::string* result) {
std::ifstream file(filename, std::ios::in | std::ios::binary | std::ios::ate);
@@ -86,13 +90,18 @@ int main(int argc, char** argv) {
return 1;
}

std::string user_agent = argv[2];
std::vector<std::string> user_agents(1, user_agent);
std::string input_useragents = argv[2];
std::vector<std::string> useragents;
std::string ua;
std::istringstream ss(input_useragents);
while (std::getline(ss, ua, ',')) {
useragents.push_back(ua);
}
googlebot::RobotsMatcher matcher;
std::string url = argv[3];
bool allowed = matcher.AllowedByRobots(robots_content, &user_agents, url);
bool allowed = matcher.AllowedByRobots(robots_content, &useragents, url);

std::cout << "user-agent '" << user_agent << "' with URI '" << argv[3]
std::cout << "user-agent '" << input_useragents << "' with URI '" << argv[3]
<< "': " << (allowed ? "ALLOWED" : "DISALLOWED") << std::endl;
if (robots_content.empty()) {
std::cout << "notice: robots file is empty so all user-agents are allowed"
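Combining the examples from the updated header comment, a multi-agent run of the binary would look roughly like:

robots_main /home/users/username/robots.txt Googlebot,Googlebot-image https://example.com/accessible/url.html

argv[2] is split on commas and the resulting vector is passed to AllowedByRobots, so the single ALLOWED/DISALLOWED verdict applies to the combined ruleset of all listed agents.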
43 changes: 42 additions & 1 deletion robots_test.cc
@@ -15,7 +15,7 @@
// This file tests the robots.txt parsing and matching code found in robots.cc
// against the current Robots Exclusion Protocol (REP) internet draft (I-D).
// https://tools.ietf.org/html/draft-koster-rep
#include "robots.h"
#include "./robots.h"

#include <string>

@@ -33,6 +33,19 @@ bool IsUserAgentAllowed(const absl::string_view robotstxt,
return matcher.OneAgentAllowedByRobots(robotstxt, useragent, url);
}

bool AllowedByRobots(const absl::string_view robotstxt,
const std::string& input_useragents,
const std::string& url) {
std::vector<std::string> useragents;
std::string ua;
std::istringstream ss(input_useragents);
while (std::getline(ss, ua, ',')) {
useragents.push_back(ua);
}
RobotsMatcher matcher;
return matcher.AllowedByRobots(robotstxt, &useragents, url);
}

// Google-specific: system test.
TEST(RobotsUnittest, GoogleOnly_SystemTest) {
const absl::string_view robotstxt =
@@ -123,6 +136,34 @@ TEST(RobotsUnittest, ID_LineSyntax_Groups) {
EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BazBot", url_foo));
}

// Test based on the documentation at
// https://developers.google.com/search/reference/robots_txt#order-of-precedence-for-user-agents
// "Only one group is valid for a particular crawler"
// "The group followed is group 1. Only the most specific group is followed,
// all others are ignored"
TEST(RobotsUnittest, ID_Multiple_Useragents) {
const absl::string_view robotstxt =
"user-agent: googlebot-news\n"
"Disallow: /bar/\n"
"\n"
"user-agent: *\n"
"Disallow: /baz/\n"
"\n\n"
"user-agent: googlebot\n"
"Disallow: /foo/\n";

const std::string url_foo = "http://foo.bar/foo/";
const std::string url_bar = "http://foo.bar/bar/";
const std::string url_baz = "http://foo.bar/baz/";
const std::string url_qux = "http://foo.bar/qux/";

// The first expectation below currently fails.
EXPECT_TRUE(AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_foo));
EXPECT_FALSE(AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_bar));
EXPECT_TRUE(AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_baz));
EXPECT_TRUE(AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_qux));
}

// REP lines are case insensitive. See REP I-D section "Protocol Definition".
// https://tools.ietf.org/html/draft-koster-rep#section-2.1
TEST(RobotsUnittest, ID_REPLineNamesCaseInsensitive) {
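To make the intent of ID_Multiple_Useragents more concrete, here is a small illustrative sketch (not part of the PR) that contrasts per-agent matching with the combined call. It reuses the file-local IsUserAgentAllowed and AllowedByRobots helpers and assumes the merging behaviour that the note about the first expectation refers to.

// 'robotstxt' is the same body as in ID_Multiple_Useragents.
void IllustrateCombinedVsPerAgent(const absl::string_view robotstxt) {
  const std::string url_foo = "http://foo.bar/foo/";
  // googlebot-news's own group only has "Disallow: /bar/", so on its own it
  // may fetch /foo/.
  bool alone = IsUserAgentAllowed(robotstxt, "googlebot-news", url_foo);
  // The combined ruleset also contains googlebot's "Disallow: /foo/", so the
  // merged verdict for /foo/ is DISALLOWED, which is why the first
  // expectation above currently fails.
  bool combined = AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_foo);
  (void)alone;
  (void)combined;
}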