diff --git a/hosts/gerrit01/default.nix b/hosts/gerrit01/default.nix
index 004f0e5..085dce7 100755
--- a/hosts/gerrit01/default.nix
+++ b/hosts/gerrit01/default.nix
@@ -23,6 +23,9 @@
     };
   };
 
+  # Block all these crawlers!!
+  bagel.services.nginx.crawler-blocker.enable = true;
+
   fileSystems."/gerrit-data" = {
     device = "/dev/disk/by-uuid/d1062305-0dea-4740-9a27-b6b1691862a4";
     fsType = "ext4";
diff --git a/services/block-crawlers/blocked-ua.txt b/services/block-crawlers/blocked-ua.txt
new file mode 100644
index 0000000..528f190
--- /dev/null
+++ b/services/block-crawlers/blocked-ua.txt
@@ -0,0 +1,40 @@
+AI2Bot
+Ai2Bot-Dolma
+Amazonbot
+anthropic-ai
+Applebot
+Applebot-Extended
+Bytespider
+CCBot
+ChatGPT-User
+Claude-Web
+ClaudeBot
+cohere-ai
+Diffbot
+FacebookBot
+facebookexternalhit
+FriendlyCrawler
+Google-Extended
+GoogleOther
+GoogleOther-Image
+GoogleOther-Video
+GPTBot
+iaskspider/2.0
+ICC-Crawler
+ImagesiftBot
+img2dataset
+ISSCyberRiskCrawler
+Kangaroo Bot
+Meta-ExternalAgent
+Meta-ExternalFetcher
+OAI-SearchBot
+omgili
+omgilibot
+PerplexityBot
+PetalBot
+Scrapy
+Sidetrade indexer bot
+Timpibot
+VelenPublicWebCrawler
+Webzio-Extended
+YouBot
diff --git a/services/block-crawlers/default.nix b/services/block-crawlers/default.nix
new file mode 100644
index 0000000..98924d4
--- /dev/null
+++ b/services/block-crawlers/default.nix
@@ -0,0 +1,33 @@
+{ pkgs, config, lib, ... }:
+let
+  inherit (lib) mkEnableOption mkIf mkOption types concatStringsSep mkDefault;
+  cfg = config.bagel.services.nginx.crawler-blocker;
+  mkRobotsFile = blockedUAs: pkgs.writeText "robots.txt" ''
+    ${concatStringsSep "\n" (map (ua: "User-Agent: ${ua}") blockedUAs)}
+    Disallow: /
+  '';
+in
+{
+  options = {
+    bagel.services.nginx.crawler-blocker = {
+      enable = mkEnableOption "the crawler blocker";
+
+      userAgents = mkOption {
+        type = types.listOf types.str;
+        # Drop the empty element produced by the trailing newline in blocked-ua.txt.
+        default = lib.filter (ua: ua != "") (lib.splitString "\n" (builtins.readFile ./blocked-ua.txt));
+      };
+    };
+
+    services.nginx.virtualHosts = mkOption {
+      type = types.attrsOf (types.submodule {
+        config = {
+          locations."= /robots.txt" = mkIf cfg.enable (mkDefault {
+            alias = mkRobotsFile cfg.userAgents;
+          });
+        };
+      });
+    };
+  };
+}
+
diff --git a/services/default.nix b/services/default.nix
index 6cbad0f..08a2272 100644
--- a/services/default.nix
+++ b/services/default.nix
@@ -1,5 +1,6 @@
 {
   imports = [
+    ./block-crawlers
     ./gerrit
     ./channel-scripts
     ./hydra
diff --git a/services/gerrit/www.nix b/services/gerrit/www.nix
index 83cdcb0..e2b8561 100644
--- a/services/gerrit/www.nix
+++ b/services/gerrit/www.nix
@@ -29,10 +29,6 @@ in
         # NGINX should not give up super fast. Things can take time.
         proxy_read_timeout 3600;
       }
-
-      location = /robots.txt {
-        return 200 'User-agent: *\nAllow: /';
-      }
     '';
   };
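
Note on the mechanism: the new module re-declares services.nginx.virtualHosts with an extra submodule, so every virtualHost gains a default "location = /robots.txt" that serves the generated file; because the location is wrapped in mkDefault, an individual host can still override it, which is why the hand-written robots.txt block in services/gerrit/www.nix can simply be dropped. For illustration only, the file produced by mkRobotsFile from the default blocked-ua.txt list would look roughly like this (list abbreviated here, the real file carries all 40 entries):

    User-Agent: AI2Bot
    User-Agent: Ai2Bot-Dolma
    User-Agent: Amazonbot
    User-Agent: YouBot
    Disallow: /

Since all User-Agent lines precede the single rule, they form one robots.txt group and the blanket Disallow: / applies to every listed crawler.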