2020-09-15 20:09:37 -04:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
|
|
|
module Gitlab
  module RobotsTxt
    # Minimal robots.txt parser.
    #
    # It only understands `Disallow:` and `Allow:` lines and ignores every
    # other line (including `User-Agent:`, so rules are not grouped per agent).
    #
    # Within a rule value, `*` matches zero or more characters and a trailing
    # `$` anchors the rule to the end of the path. Directive names are matched
    # case-insensitively; rule values are matched against paths as written.
    # `Allow` rules take precedence over `Disallow` rules.
    class Parser
      # Directive prefixes. Leading whitespace and whitespace around the colon
      # are optional, so `Disallow:/x` and `  Disallow : /x` are both accepted.
      DISALLOW_REGEX = /^[ \t]*disallow[ \t]*:[ \t]*/i.freeze
      ALLOW_REGEX = /^[ \t]*allow[ \t]*:[ \t]*/i.freeze

      # Everything from `#` to the end of a line is a robots.txt comment.
      COMMENT_REGEX = /#.*/.freeze
      private_constant :COMMENT_REGEX

      # @return [Array<Regexp>] compiled rules, in file order
      attr_reader :disallow_rules, :allow_rules

      # @param content [String, nil] raw robots.txt text; nil is treated as empty
      def initialize(content)
        @raw_content = content

        @disallow_rules, @allow_rules = parse_raw_content!
      end

      # @param path [String] request path to check
      # @return [Boolean] true when some `Disallow` rule matches `path` and no
      #   `Allow` rule does
      def disallowed?(path)
        return false if allow_rules.any? { |rule| rule.match?(path) }

        disallow_rules.any? { |rule| rule.match?(path) }
      end

      private

      # Scans the raw content line by line and compiles every recognized rule.
      #
      # @return [Array(Array<Regexp>, Array<Regexp>)] `[disallowed, allowed]`
      def parse_raw_content!
        disallowed = []
        allowed = []

        @raw_content.to_s.each_line do |raw_line|
          # Drop trailing comments so they do not become part of the pattern.
          line = raw_line.sub(COMMENT_REGEX, '')

          if disallow_rule?(line)
            pattern = get_disallow_pattern(line)
            disallowed << pattern if pattern
          elsif allow_rule?(line)
            pattern = get_allow_pattern(line)
            allowed << pattern if pattern
          end
        end

        [disallowed, allowed]
      end

      def disallow_rule?(line)
        DISALLOW_REGEX.match?(line)
      end

      def get_disallow_pattern(line)
        get_pattern(line, DISALLOW_REGEX)
      end

      def allow_rule?(line)
        ALLOW_REGEX.match?(line)
      end

      def get_allow_pattern(line)
        get_pattern(line, ALLOW_REGEX)
      end

      # Compiles the value of a rule line into a Regexp anchored at the start
      # of the path.
      #
      # @param line [String] a line already known to match `rule_regex`
      # @param rule_regex [Regexp] directive prefix to strip from the line
      # @return [Regexp, nil] nil when the rule has no value; an empty rule
      #   places no restriction and must not compile to a match-all pattern
      def get_pattern(line, rule_regex)
        value = line.sub(rule_regex, '').strip
        return if value.empty?

        # Escape everything, then re-enable the robots.txt `*` wildcard.
        escaped = Regexp.escape(value).gsub('\*', '.*')
        # A trailing `$` (escaped above to `\$`) anchors to the end of the
        # path. Block form keeps the replacement text literal.
        escaped = escaped.sub(/\\\$\z/) { '\z' }

        # \A / \z rather than ^ / $ so an embedded newline in the path cannot
        # make a rule match against an interior line.
        Regexp.new("\\A#{escaped}")
      end
    end
  end
end
|