Support leading wildcards in robots.txt Allow/Disallow rules.

A rule whose text starts with one or more '*' is now registered twice: once
with the leading run of '*' stripped (via the new trim_front helper) and once
as written. The patch also re-wraps the std::sort call that orders directives
by priority, and adds tests covering both rule kinds.

Review notes:
  * The wildcard checks test !query.empty() before calling query.front(),
    because std::string::front() has a non-empty precondition and the code
    that precedes these hunks is not visible here.
  * Indentation was reconstructed from a line-collapsed copy of this patch;
    if context lines do not match, apply with `git apply --ignore-whitespace`.
  * The blob hashes on the `index` lines are those of the original patch.

diff --git a/src/agent.cpp b/src/agent.cpp
index 33824c5..da7768a 100644
--- a/src/agent.cpp
+++ b/src/agent.cpp
@@ -13,6 +13,15 @@ namespace
     {
         return url.defrag().escape().fullpath();
     }
+
+    // Return a copy of `str` with every leading occurrence of `chr` removed;
+    // the result is empty when `str` consists solely of `chr`.
+    std::string trim_front(const std::string& str, const char chr)
+    {
+        auto itr = std::find_if(str.begin(), str.end(),
+            [chr](const char c) {return c != chr;});
+        return std::string(itr, str.end());
+    }
 }
 
 namespace Rep
@@ -25,6 +34,12 @@ namespace Rep
         {
             return *this;
         }
+        // leading wildcard? (empty() is checked first: front() needs a non-empty string)
+        if (!query.empty() && query.front() == '*')
+        {
+            Url::Url trimmed(trim_front(query, '*'));
+            directives_.push_back(Directive(escape_url(trimmed), true));
+        }
         directives_.push_back(Directive(escape_url(url), true));
         sorted_ = false;
         return *this;
@@ -45,6 +60,12 @@ namespace Rep
             {
                 return *this;
             }
+            // leading wildcard? (empty() is checked first: front() needs a non-empty string)
+            if (!query.empty() && query.front() == '*')
+            {
+                Url::Url trimmed(trim_front(query, '*'));
+                directives_.push_back(Directive(escape_url(trimmed), false));
+            }
             directives_.push_back(Directive(escape_url(url), false));
         }
         sorted_ = false;
@@ -55,9 +76,10 @@ namespace Rep
     {
         if (!sorted_)
         {
-            std::sort(directives_.begin(), directives_.end(), [](const Directive& a, const Directive& b) {
-                return b.priority() < a.priority();
-            });
+            std::sort(directives_.begin(), directives_.end(),
+                [](const Directive& a, const Directive& b) {
+                    return b.priority() < a.priority();
+                });
             sorted_ = true;
         }
         return directives_;
diff --git a/test/test-robots.cpp b/test/test-robots.cpp
index 4e87136..7c61352 100644
--- a/test/test-robots.cpp
+++ b/test/test-robots.cpp
@@ -319,3 +319,41 @@ TEST(RobotsTest, NeverExternalAllowed)
     Rep::Robots robot("", "http://a.com/robots.txt");
     EXPECT_FALSE(robot.allowed("http://b.com/", "one"));
 }
+
+// Allow rules with leading '*' must match both at the root and at any depth.
+TEST(RobotsTest, LeadingWildcardAllow)
+{
+    std::string content =
+        "User-agent: meow\n"
+        "Disallow: /\n"
+        "Allow: ****/cats\n"
+        "Allow: */kangaroos\n";
+    Rep::Robots robot(content);
+
+    EXPECT_FALSE(robot.allowed("/kangaroo/zebra/cat/page.html", "meow"));
+    EXPECT_TRUE(robot.allowed("/cats.html", "meow"));
+    EXPECT_TRUE(robot.allowed("/cats/page.html", "meow"));
+    EXPECT_TRUE(robot.allowed("/get/more/cats/page.html", "meow"));
+    EXPECT_TRUE(robot.allowed("/kangaroos/page.html", "meow"));
+    EXPECT_TRUE(robot.allowed("/heaps/of/kangaroos/page.html", "meow"));
+    EXPECT_TRUE(robot.allowed("/kangaroosandkoalas/page.html", "meow"));
+}
+
+// Disallow rules with leading '*' must match both at the root and at any depth.
+TEST(RobotsTest, LeadingWildcardDisallow)
+{
+    std::string content =
+        "User-agent: meow\n"
+        "Allow: /\n"
+        "Disallow: ****/cats\n"
+        "Disallow: */kangaroos\n";
+    Rep::Robots robot(content);
+
+    EXPECT_TRUE(robot.allowed("/kangaroo/zebra/cat/page.html", "meow"));
+    EXPECT_FALSE(robot.allowed("/cats.html", "meow"));
+    EXPECT_FALSE(robot.allowed("/cats/page.html", "meow"));
+    EXPECT_FALSE(robot.allowed("/get/more/cats/page.html", "meow"));
+    EXPECT_FALSE(robot.allowed("/kangaroos/page.html", "meow"));
+    EXPECT_FALSE(robot.allowed("/heaps/of/kangaroos/page.html", "meow"));
+    EXPECT_FALSE(robot.allowed("/kangaroosandkoalas/page.html", "meow"));
+}