From 51cc0d4e81e21ab7649392e7096fb45c52e9cb1e Mon Sep 17 00:00:00 2001
From: Will Critchlow
Date: Fri, 8 Nov 2019 12:55:52 +0000
Subject: [PATCH 1/4] Allow multiple user agents

Take a comma-delimited list of user agents and pass them as a vector.
---
 robots_main.cc | 14 ++++++++++----
 robots_test.cc | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/robots_main.cc b/robots_main.cc
index 0e60f02..ac16321 100644
--- a/robots_main.cc
+++ b/robots_main.cc
@@ -34,6 +34,7 @@
 //
 #include <fstream>
 #include <iostream>
+#include <sstream>
 
 #include "robots.h"
 
@@ -86,13 +87,18 @@ int main(int argc, char** argv) {
     return 1;
   }
 
-  std::string user_agent = argv[2];
-  std::vector<std::string> user_agents(1, user_agent);
+  std::string input_useragents = argv[2];
+  std::vector<std::string> useragents;
+  std::string ua;
+  std::istringstream ss(input_useragents);
+  while(std::getline(ss, ua, ',')) {
+    useragents.push_back(ua);
+  }
   googlebot::RobotsMatcher matcher;
   std::string url = argv[3];
-  bool allowed = matcher.AllowedByRobots(robots_content, &user_agents, url);
+  bool allowed = matcher.AllowedByRobots(robots_content, &useragents, url);
 
-  std::cout << "user-agent '" << user_agent << "' with URI '" << argv[3]
+  std::cout << "user-agent '" << input_useragents << "' with URI '" << argv[3]
             << "': " << (allowed ? "ALLOWED" : "DISALLOWED") << std::endl;
   if (robots_content.empty()) {
     std::cout << "notice: robots file is empty so all user-agents are allowed"
diff --git a/robots_test.cc b/robots_test.cc
index 369efcd..ea5a2d3 100644
--- a/robots_test.cc
+++ b/robots_test.cc
@@ -33,6 +33,18 @@ bool IsUserAgentAllowed(const absl::string_view robotstxt,
   return matcher.OneAgentAllowedByRobots(robotstxt, useragent, url);
 }
 
+bool AllowedByRobots(const absl::string_view robotstxt,
+                     const std::string& input_useragents, const std::string& url) {
+  std::vector<std::string> useragents;
+  std::string ua;
+  std::istringstream ss(input_useragents);
+  while(std::getline(ss, ua, ',')) {
+    useragents.push_back(ua);
+  }
+  RobotsMatcher matcher;
+  return matcher.AllowedByRobots(robotstxt, &useragents, url);
+}
+
 // Google-specific: system test.
 TEST(RobotsUnittest, GoogleOnly_SystemTest) {
   const absl::string_view robotstxt =
@@ -123,6 +135,32 @@ TEST(RobotsUnittest, ID_LineSyntax_Groups) {
   EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BazBot", url_foo));
 }
 
+// Test based on the documentation at
+// https://developers.google.com/search/reference/robots_txt#order-of-precedence-for-user-agents
+// "Only one group is valid for a particular crawler"
+// "The group followed is group 1. Only the most specific group is followed, all others are ignored"
+TEST(RobotsUnittest, ID_Multiple_Useragents) {
+  const absl::string_view robotstxt =
+      "user-agent: googlebot-news\n"
+      "Disallow: /bar/\n"
+      "\n"
+      "user-agent: *\n"
+      "Disallow: /baz/\n"
+      "\n\n"
+      "user-agent: googlebot\n"
+      "Disallow: /foo/\n";
+
+  const std::string url_foo = "http://foo.bar/foo/";
+  const std::string url_bar = "http://foo.bar/bar/";
+  const std::string url_baz = "http://foo.bar/baz/";
+  const std::string url_qux = "http://foo.bar/qux/";
+
+  EXPECT_TRUE(AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_foo));  // this currently fails
+  EXPECT_FALSE(AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_bar));
+  EXPECT_TRUE(AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_baz));
+  EXPECT_TRUE(AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_qux));
+}
+
 // REP lines are case insensitive. See REP I-D section "Protocol Definition".
 // https://tools.ietf.org/html/draft-koster-rep
 TEST(RobotsUnittest, ID_REPLineNamesCaseInsensitive) {
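
A note on the parsing added in PATCH 1/4: the new loop in robots_main.cc splits
argv[2] with std::getline over a std::istringstream, so each comma-separated
field is kept exactly as typed (no whitespace trimming, no de-duplication, and
an empty field yields an empty string). The standalone sketch below reproduces
that behaviour outside the binary; the helper name SplitUserAgents and the demo
main are illustrative only and are not part of the patch.

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Mirrors the splitting loop added to robots_main.cc: every field between
// commas becomes one entry, stored verbatim.
std::vector<std::string> SplitUserAgents(const std::string& input_useragents) {
  std::vector<std::string> useragents;
  std::string ua;
  std::istringstream ss(input_useragents);
  while (std::getline(ss, ua, ',')) {
    useragents.push_back(ua);
  }
  return useragents;
}

int main() {
  // With a space after the comma the second token keeps its leading space:
  // prints "[Googlebot]" and "[ Googlebot-Image]".
  for (const std::string& ua : SplitUserAgents("Googlebot, Googlebot-Image")) {
    std::cout << "[" << ua << "]\n";
  }
  return 0;
}
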

From 9d757576bfdfa054282c505ba32f7cf933fad649 Mon Sep 17 00:00:00 2001
From: Will Critchlow
Date: Mon, 11 Nov 2019 16:47:27 +0000
Subject: [PATCH 2/4] Modify comment to match behaviour

My best understanding of what the code actually does with a vector of user
agents is that it treats the vector as if all the rules applying to any of
the user agents were collapsed into a single ruleset applying to all of them.
---
 robots.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/robots.h b/robots.h
index adccef5..fa4b7d9 100644
--- a/robots.h
+++ b/robots.h
@@ -112,7 +112,9 @@ class RobotsMatcher : protected RobotsParseHandler {
   static bool IsValidUserAgentToObey(absl::string_view user_agent);
 
   // Returns true iff 'url' is allowed to be fetched by any member of the
-  // "user_agents" vector. 'url' must be %-encoded according to RFC3986.
+  // "user_agents" vector after collapsing all rules applying to any member of 
+  // the "user_agents" vector into a single ruleset. 'url' must be %-encoded 
+  // according to RFC3986.
   bool AllowedByRobots(absl::string_view robots_body,
                        const std::vector<std::string>* user_agents,
                        const std::string& url);

From 74d6d76277bf706f77673d7a3a5588c66b30c1bc Mon Sep 17 00:00:00 2001
From: Will Critchlow
Date: Mon, 11 Nov 2019 17:02:35 +0000
Subject: [PATCH 3/4] Update comment explaining how file works

Update the comments to cover the possibility of passing in a vector of user
agents.
---
 robots_main.cc | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/robots_main.cc b/robots_main.cc
index ac16321..04dde56 100644
--- a/robots_main.cc
+++ b/robots_main.cc
@@ -16,21 +16,24 @@
 // File: robots_main.cc
 // -----------------------------------------------------------------------------
 //
-// Simple binary to assess whether a URL is accessible to a user-agent according
-// to records found in a local robots.txt file, based on Google's robots.txt
-// parsing and matching algorithms.
+// Simple binary to assess whether a URL is accessible to a set of user-agents 
+// according to records found in a local robots.txt file, based on Google's 
+// robots.txt parsing and matching algorithms.
 // Usage:
 //     robots_main <local_path_to_robotstxt> <user_agent> <url>
 // Arguments:
 // local_path_to_robotstxt: local path to a file containing robots.txt records.
 //   For example: /home/users/username/robots.txt
-// user_agent: a token to be matched against records in the robots.txt.
-//   For example: Googlebot
+// user_agent: a token to be matched against records in the robots.txt (or a
+//   comma-separated list of user agents)
+//   For example: Googlebot or Googlebot,Googlebot-image
 // url: a url to be matched against records in the robots.txt. The URL must be
 //   %-encoded according to RFC3986.
 //   For example: https://example.com/accessible/url.html
 // Returns: Prints a sentence with verdict about whether 'user_agent' is allowed
-//         to access 'url' based on records in 'local_path_to_robotstxt'.
+//         to access 'url' based on records in 'local_path_to_robotstxt'. When multiple
+//         user agents are provided, check them as a vector based on functionality in
+//         "AllowedByRobots" method.
 //
 #include <fstream>
 #include <iostream>
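
Before the final tidy-up patch, it may help to see the vector overload driven
directly. The sketch below is illustrative only (the standalone main and the
hard-coded strings are not part of the patches): it calls
googlebot::RobotsMatcher::AllowedByRobots the same way robots_main.cc now does,
using the robots.txt and user-agent pair from the new ID_Multiple_Useragents
test. Under the collapsed-ruleset reading described in PATCH 2/4, the googlebot
group's "Disallow: /foo/" also applies when googlebot-news is in the list,
which would explain why the first expectation in that test currently fails.

#include <iostream>
#include <string>
#include <vector>

#include "robots.h"  // googlebot::RobotsMatcher, as declared in the patches above

// Illustrative driver, not part of the patch series.
int main() {
  const std::string robotstxt =
      "user-agent: googlebot-news\n"
      "Disallow: /bar/\n"
      "\n"
      "user-agent: *\n"
      "Disallow: /baz/\n"
      "\n\n"
      "user-agent: googlebot\n"
      "Disallow: /foo/\n";

  // Same pair the new test passes as "googlebot,googlebot-news".
  std::vector<std::string> useragents = {"googlebot", "googlebot-news"};

  const std::vector<std::string> urls = {
      "http://foo.bar/foo/", "http://foo.bar/bar/", "http://foo.bar/baz/"};
  for (const std::string& url : urls) {
    googlebot::RobotsMatcher matcher;
    const bool allowed = matcher.AllowedByRobots(robotstxt, &useragents, url);
    std::cout << url << ": " << (allowed ? "ALLOWED" : "DISALLOWED") << std::endl;
  }
  return 0;
}
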

From 3dfb0d2ba70d265977af7a7ff42808a34267b832 Mon Sep 17 00:00:00 2001
From: Will Critchlow
Date: Tue, 12 Nov 2019 13:24:06 +0000
Subject: [PATCH 4/4] Follow cpplint directives

Tidy up the code to Google C++ standards.
---
 robots.cc      |  2 +-
 robots.h       |  4 ++--
 robots_main.cc | 12 ++++++------
 robots_test.cc | 13 ++++++++-----
 4 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/robots.cc b/robots.cc
index bdbccea..12caaff 100644
--- a/robots.cc
+++ b/robots.cc
@@ -21,7 +21,7 @@
 // with Google-specific optimizations detailed at
 // https://developers.google.com/search/reference/robots_txt
 
-#include "robots.h"
+#include "./robots.h"
 
 #include
diff --git a/robots.h b/robots.h
index fa4b7d9..f22038b 100644
--- a/robots.h
+++ b/robots.h
@@ -112,8 +112,8 @@ class RobotsMatcher : protected RobotsParseHandler {
   static bool IsValidUserAgentToObey(absl::string_view user_agent);
 
   // Returns true iff 'url' is allowed to be fetched by any member of the
-  // "user_agents" vector after collapsing all rules applying to any member of 
-  // the "user_agents" vector into a single ruleset. 'url' must be %-encoded 
+  // "user_agents" vector after collapsing all rules applying to any member of
+  // the "user_agents" vector into a single ruleset. 'url' must be %-encoded
   // according to RFC3986.
   bool AllowedByRobots(absl::string_view robots_body,
                        const std::vector<std::string>* user_agents,
diff --git a/robots_main.cc b/robots_main.cc
index 04dde56..a96fffa 100644
--- a/robots_main.cc
+++ b/robots_main.cc
@@ -16,8 +16,8 @@
 // File: robots_main.cc
 // -----------------------------------------------------------------------------
 //
-// Simple binary to assess whether a URL is accessible to a set of user-agents 
-// according to records found in a local robots.txt file, based on Google's 
+// Simple binary to assess whether a URL is accessible to a set of user-agents
+// according to records found in a local robots.txt file, based on Google's
 // robots.txt parsing and matching algorithms.
 // Usage:
 //     robots_main <local_path_to_robotstxt> <user_agent> <url>
@@ -31,15 +31,15 @@
 //   %-encoded according to RFC3986.
 //   For example: https://example.com/accessible/url.html
 // Returns: Prints a sentence with verdict about whether 'user_agent' is allowed
-//         to access 'url' based on records in 'local_path_to_robotstxt'. When multiple
-//         user agents are provided, check them as a vector based on functionality in
-//         "AllowedByRobots" method.
+//         to access 'url' based on records in 'local_path_to_robotstxt'. When
+//         multiple user agents are provided, check them as a vector based on
+//         functionality in "AllowedByRobots" method.
 //
 #include <fstream>
 #include <iostream>
 #include <sstream>
 
-#include "robots.h"
+#include "./robots.h"
 
 bool LoadFile(const std::string& filename, std::string* result) {
   std::ifstream file(filename, std::ios::in | std::ios::binary | std::ios::ate);
@@ -94,7 +94,7 @@ int main(int argc, char** argv) {
   std::vector<std::string> useragents;
   std::string ua;
   std::istringstream ss(input_useragents);
-  while(std::getline(ss, ua, ',')) {
+  while (std::getline(ss, ua, ',')) {
     useragents.push_back(ua);
   }
   googlebot::RobotsMatcher matcher;
diff --git a/robots_test.cc b/robots_test.cc
index ea5a2d3..7e64e1f 100644
--- a/robots_test.cc
+++ b/robots_test.cc
@@ -15,7 +15,7 @@
 // This file tests the robots.txt parsing and matching code found in robots.cc
 // against the current Robots Exclusion Protocol (REP) internet draft (I-D).
 // https://tools.ietf.org/html/draft-koster-rep
-#include "robots.h"
+#include "./robots.h"
 
 #include
@@ -34,11 +34,12 @@ bool IsUserAgentAllowed(const absl::string_view robotstxt,
 }
 
 bool AllowedByRobots(const absl::string_view robotstxt,
-                     const std::string& input_useragents, const std::string& url) {
+                     const std::string& input_useragents,
+                     const std::string& url) {
   std::vector<std::string> useragents;
   std::string ua;
   std::istringstream ss(input_useragents);
-  while(std::getline(ss, ua, ',')) {
+  while (std::getline(ss, ua, ',')) {
     useragents.push_back(ua);
   }
   RobotsMatcher matcher;
@@ -138,7 +139,8 @@ TEST(RobotsUnittest, ID_LineSyntax_Groups) {
   EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BazBot", url_foo));
 }
 
 // Test based on the documentation at
 // https://developers.google.com/search/reference/robots_txt#order-of-precedence-for-user-agents
 // "Only one group is valid for a particular crawler"
-// "The group followed is group 1. Only the most specific group is followed, all others are ignored"
+// "The group followed is group 1. Only the most specific group is followed,
+// all others are ignored"
 TEST(RobotsUnittest, ID_Multiple_Useragents) {
   const absl::string_view robotstxt =
       "user-agent: googlebot-news\n"
@@ -155,7 +157,8 @@ TEST(RobotsUnittest, ID_Multiple_Useragents) {
   const std::string url_baz = "http://foo.bar/baz/";
   const std::string url_qux = "http://foo.bar/qux/";
 
-  EXPECT_TRUE(AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_foo));  // this currently fails
+  // the first test currently fails
+  EXPECT_TRUE(AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_foo));
   EXPECT_FALSE(AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_bar));
   EXPECT_TRUE(AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_baz));
   EXPECT_TRUE(AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_qux));
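
With all four patches applied, the second argument of the sample binary accepts
either a single token or a comma-separated list, as documented in the updated
header comment of robots_main.cc. A hypothetical invocation, reusing the example
paths from that comment (the actual verdict depends on the robots.txt contents),
would be:

robots_main /home/users/username/robots.txt Googlebot,Googlebot-image https://example.com/accessible/url.html

which prints the verdict line built in main(), e.g. "user-agent
'Googlebot,Googlebot-image' with URI 'https://example.com/accessible/url.html':
ALLOWED" (or DISALLOWED), now reporting the whole comma-separated list rather
than a single user agent.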