feat: automatically block crawlers if the blocker is enabled #145
@@ -23,6 +23,9 @@
     };
   };
 
+  # Block all these crawlers!!
+  bagel.services.nginx.crawler-blocker.enable = true;
+
   fileSystems."/gerrit-data" = {
     device = "/dev/disk/by-uuid/d1062305-0dea-4740-9a27-b6b1691862a4";
     fsType = "ext4";
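Note: the enable flag set above is declared by the crawler-blocker module added later in this diff. That module also exposes a userAgents list option, so a host could, purely as a hypothetical sketch, override the default blocklist at the same time (both bot names below appear in blocked-ua.txt):

    # Hypothetical per-host override (sketch): enable the blocker and
    # replace the default list from blocked-ua.txt with a shorter one.
    bagel.services.nginx.crawler-blocker = {
      enable = true;
      userAgents = [ "GPTBot" "ClaudeBot" ];
    };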
services/block-crawlers/blocked-ua.txt (new file, 40 lines)
@@ -0,0 +1,40 @@
+AI2Bot
+Ai2Bot-Dolma
+Amazonbot
+anthropic-ai
+Applebot
+Applebot-Extended
+Bytespider
+CCBot
+ChatGPT-User
+Claude-Web
+ClaudeBot
+cohere-ai
+Diffbot
+FacebookBot
+facebookexternalhit
+FriendlyCrawler
+Google-Extended
+GoogleOther
+GoogleOther-Image
+GoogleOther-Video
+GPTBot
+iaskspider/2.0
+ICC-Crawler
+ImagesiftBot
+img2dataset
+ISSCyberRiskCrawler
+Kangaroo Bot
+Meta-ExternalAgent
+Meta-ExternalFetcher
+OAI-SearchBot
+omgili
+omgilibot
+PerplexityBot
+PetalBot
+Scrapy
+Sidetrade indexer bot
+Timpibot
+VelenPublicWebCrawler
+Webzio-Extended
+YouBot
services/block-crawlers/default.nix (new file, 32 lines)
@@ -0,0 +1,32 @@
+{ pkgs, config, lib, ... }:
+let
+  inherit (lib) mkEnableOption mkIf mkOption types concatStringsSep mkDefault splitString;
+  cfg = config.bagel.services.nginx.crawler-blocker;
+  mkRobotsFile = blockedUAs: pkgs.writeText "robots.txt" ''
+    ${concatStringsSep "\n" (map (ua: "User-agent: ${ua}") blockedUAs)}
+    Disallow: /
+  '';
+in
+{
+  options = {
+    bagel.services.nginx.crawler-blocker = {
+      enable = mkEnableOption "the crawler blocker";
+
+      userAgents = mkOption {
+        type = types.listOf types.str;
+        default = splitString "\n" (builtins.readFile ./blocked-ua.txt);
+      };
+    };
+
+    services.nginx.virtualHosts = mkOption {
+      type = types.attrsOf (types.submodule {
+        config = {
+          locations."= /robots.txt" = mkIf cfg.enable (mkDefault {
+            alias = mkRobotsFile cfg.userAgents;
+          });
+        };
+      });
+    };
+  };
+}
+
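For illustration: mkRobotsFile emits one User-agent line per blocked UA followed by a single Disallow rule, so with the default list the generated robots.txt looks roughly like this (only the first and last entries shown):

    User-agent: AI2Bot
    User-agent: Ai2Bot-Dolma
    User-agent: Amazonbot
    ...
    User-agent: YouBot
    Disallow: /

Because the injected location is wrapped in mkDefault, an individual virtual host can still define its own robots.txt and take precedence over the generated one.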
@@ -1,5 +1,6 @@
 {
   imports = [
+    ./block-crawlers
     ./gerrit
     ./channel-scripts
     ./hydra
@@ -29,10 +29,6 @@ in
       # NGINX should not give up super fast. Things can take time.
       proxy_read_timeout 3600;
     }
-
-    location = /robots.txt {
-      return 200 'User-agent: *\nAllow: /';
-    }
   '';
 };
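With the hard-coded permissive robots.txt removed here, robots.txt is expected to come from the crawler-blocker module instead. For hosts that enable it, the per-vhost location the module injects renders to nginx configuration roughly like the following sketch (the store path is a placeholder, not an actual hash):

    location = /robots.txt {
        alias /nix/store/<hash>-robots.txt;
    }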