2 changes: 1 addition & 1 deletion robots.cc
@@ -21,7 +21,7 @@
// with Google-specific optimizations detailed at
// https://developers.google.com/search/reference/robots_txt

#include "robots.h"
#include "./robots.h"

#include <stdlib.h>

4 changes: 3 additions & 1 deletion robots.h
@@ -112,7 +112,9 @@ class RobotsMatcher : protected RobotsParseHandler {
static bool IsValidUserAgentToObey(absl::string_view user_agent);

// Returns true iff 'url' is allowed to be fetched by any member of the
// "user_agents" vector. 'url' must be %-encoded according to RFC3986.
// "user_agents" vector after collapsing all rules applying to any member of
// the "user_agents" vector into a single ruleset. 'url' must be %-encoded
// according to RFC3986.
bool AllowedByRobots(absl::string_view robots_body,
const std::vector<std::string>* user_agents,
const std::string& url);
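The collapsed-ruleset behaviour described in the updated comment can be sketched from the caller's side. This is an illustration, not code from the PR: the helper name is hypothetical, the agent tokens are taken from the usage example in robots_main.cc below, and the include path and googlebot namespace follow the files in this change.

#include <string>
#include <vector>

#include "./robots.h"

// Verdict for the merged rules of both agents: every group that applies to
// either token contributes its rules before 'url' is matched.
bool AllowedForBothAgents(const std::string& robots_body,
                          const std::string& url) {
  std::vector<std::string> user_agents = {"Googlebot", "Googlebot-image"};
  googlebot::RobotsMatcher matcher;
  return matcher.AllowedByRobots(robots_body, &user_agents, url);
}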
31 changes: 20 additions & 11 deletions robots_main.cc
@@ -16,26 +16,30 @@
// File: robots_main.cc
// -----------------------------------------------------------------------------
//
// Simple binary to assess whether a URL is accessible to a user-agent according
// to records found in a local robots.txt file, based on Google's robots.txt
// parsing and matching algorithms.
// Simple binary to assess whether a URL is accessible to a set of user-agents
// according to records found in a local robots.txt file, based on Google's
// robots.txt parsing and matching algorithms.
// Usage:
// robots_main <local_path_to_robotstxt> <user_agent> <url>
// Arguments:
// local_path_to_robotstxt: local path to a file containing robots.txt records.
// For example: /home/users/username/robots.txt
// user_agent: a token to be matched against records in the robots.txt.
// For example: Googlebot
// user_agent: a token to be matched against records in the robots.txt (or a
// comma-separated list of user agents).
// For example: Googlebot or Googlebot,Googlebot-image
// url: a url to be matched against records in the robots.txt. The URL must be
// %-encoded according to RFC3986.
// For example: https://example.com/accessible/url.html
// Returns: Prints a sentence with verdict about whether 'user_agent' is allowed
// to access 'url' based on records in 'local_path_to_robotstxt'.
// to access 'url' based on records in 'local_path_to_robotstxt'. When multiple
// user agents are provided, they are checked together as a vector via the
// "AllowedByRobots" method.
//
#include <fstream>
#include <iostream>
#include <sstream>

#include "robots.h"
#include "./robots.h"

bool LoadFile(const std::string& filename, std::string* result) {
std::ifstream file(filename, std::ios::in | std::ios::binary | std::ios::ate);
@@ -86,13 +90,18 @@ int main(int argc, char** argv) {
return 1;
}

std::string user_agent = argv[2];
std::vector<std::string> user_agents(1, user_agent);
std::string input_useragents = argv[2];
std::vector<std::string> useragents;
std::string ua;
std::istringstream ss(input_useragents);
while (std::getline(ss, ua, ',')) {
useragents.push_back(ua);
}
googlebot::RobotsMatcher matcher;
std::string url = argv[3];
bool allowed = matcher.AllowedByRobots(robots_content, &user_agents, url);
bool allowed = matcher.AllowedByRobots(robots_content, &useragents, url);

std::cout << "user-agent '" << user_agent << "' with URI '" << argv[3]
std::cout << "user-agent '" << input_useragents << "' with URI '" << argv[3]
<< "': " << (allowed ? "ALLOWED" : "DISALLOWED") << std::endl;
if (robots_content.empty()) {
std::cout << "notice: robots file is empty so all user-agents are allowed"
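Combining the examples from the updated header comment, a multi-agent run of the binary would look roughly like:

robots_main /home/users/username/robots.txt Googlebot,Googlebot-image https://example.com/accessible/url.html

argv[2] is split on commas and the resulting vector is passed to AllowedByRobots, so the single ALLOWED/DISALLOWED verdict applies to the combined ruleset of all listed agents.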
43 changes: 42 additions & 1 deletion robots_test.cc
@@ -15,7 +15,7 @@
// This file tests the robots.txt parsing and matching code found in robots.cc
// against the current Robots Exclusion Protocol (REP) internet draft (I-D).
// https://tools.ietf.org/html/draft-koster-rep
#include "robots.h"
#include "./robots.h"

#include <string>

@@ -33,6 +33,19 @@ bool IsUserAgentAllowed(const absl::string_view robotstxt,
return matcher.OneAgentAllowedByRobots(robotstxt, useragent, url);
}

bool AllowedByRobots(const absl::string_view robotstxt,
const std::string& input_useragents,
const std::string& url) {
std::vector<std::string> useragents;
std::string ua;
std::istringstream ss(input_useragents);
while (std::getline(ss, ua, ',')) {
useragents.push_back(ua);
}
RobotsMatcher matcher;
return matcher.AllowedByRobots(robotstxt, &useragents, url);
}

// Google-specific: system test.
TEST(RobotsUnittest, GoogleOnly_SystemTest) {
const absl::string_view robotstxt =
@@ -123,6 +136,34 @@ TEST(RobotsUnittest, ID_LineSyntax_Groups) {
EXPECT_FALSE(IsUserAgentAllowed(robotstxt, "BazBot", url_foo));
}

// Test based on the documentation at
// https://developers.google.com/search/reference/robots_txt#order-of-precedence-for-user-agents
// "Only one group is valid for a particular crawler"
// "The group followed is group 1. Only the most specific group is followed,
// all others are ignored"
TEST(RobotsUnittest, ID_Multiple_Useragents) {
const absl::string_view robotstxt =
"user-agent: googlebot-news\n"
"Disallow: /bar/\n"
"\n"
"user-agent: *\n"
"Disallow: /baz/\n"
"\n\n"
"user-agent: googlebot\n"
"Disallow: /foo/\n";

const std::string url_foo = "http://foo.bar/foo/";
const std::string url_bar = "http://foo.bar/bar/";
const std::string url_baz = "http://foo.bar/baz/";
const std::string url_qux = "http://foo.bar/qux/";

// The first expectation below currently fails.
EXPECT_TRUE(AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_foo));
EXPECT_FALSE(AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_bar));
EXPECT_TRUE(AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_baz));
EXPECT_TRUE(AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_qux));
}

// REP lines are case insensitive. See REP I-D section "Protocol Definition".
// https://tools.ietf.org/html/draft-koster-rep#section-2.1
TEST(RobotsUnittest, ID_REPLineNamesCaseInsensitive) {
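To make the intent of ID_Multiple_Useragents more concrete, here is a small illustrative sketch (not part of the PR) that contrasts per-agent matching with the combined call. It reuses the file-local IsUserAgentAllowed and AllowedByRobots helpers and assumes the merging behaviour that the note about the first expectation refers to.

// 'robotstxt' is the same body as in ID_Multiple_Useragents.
void IllustrateCombinedVsPerAgent(const absl::string_view robotstxt) {
  const std::string url_foo = "http://foo.bar/foo/";
  // googlebot-news's own group only has "Disallow: /bar/", so on its own it
  // may fetch /foo/.
  bool alone = IsUserAgentAllowed(robotstxt, "googlebot-news", url_foo);
  // The combined ruleset also contains googlebot's "Disallow: /foo/", so the
  // merged verdict for /foo/ is DISALLOWED, which is why the first
  // expectation above currently fails.
  bool combined = AllowedByRobots(robotstxt, "googlebot,googlebot-news", url_foo);
  (void)alone;
  (void)combined;
}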