feat: automatically block crawlers if the blocker is enabled

This helps us get rid of useless crawler traffic. It is enabled for gerrit01, which suffers the most from it.

Signed-off-by: Raito Bezarius <masterancpp@gmail.com>
2024-10-19 18:55:03 +02:00 · 2024-10-19 18:55:03 +02:00 · 8c0c7b517f
parent d5500d7c4e
commit 8c0c7b517f
5 changed files with 76 additions and 4 deletions
--- a/hosts/gerrit01/default.nix
+++ b/hosts/gerrit01/default.nix
@ -23,6 +23,9 @@
    };
  };

+  # Block all these crawlers!!
+  bagel.services.nginx.crawler-blocker.enable = true;
+
  fileSystems."/gerrit-data" = {
    device = "/dev/disk/by-uuid/d1062305-0dea-4740-9a27-b6b1691862a4";
    fsType = "ext4";
--- a/services/block-crawlers/blocked-ua.txt
+++ b/services/block-crawlers/blocked-ua.txt
@ -0,0 +1,40 @@
+AI2Bot
+Ai2Bot-Dolma
+Amazonbot
+anthropic-ai
+Applebot
+Applebot-Extended
+Bytespider
+CCBot
+ChatGPT-User
+Claude-Web
+ClaudeBot
+cohere-ai
+Diffbot
+FacebookBot
+facebookexternalhit
+FriendlyCrawler
+Google-Extended
+GoogleOther
+GoogleOther-Image
+GoogleOther-Video
+GPTBot
+iaskspider/2.0
+ICC-Crawler
+ImagesiftBot
+img2dataset
+ISSCyberRiskCrawler
+Kangaroo Bot
+Meta-ExternalAgent
+Meta-ExternalFetcher
+OAI-SearchBot
+omgili
+omgilibot
+PerplexityBot
+PetalBot
+Scrapy
+Sidetrade indexer bot
+Timpibot
+VelenPublicWebCrawler
+Webzio-Extended
+YouBot
--- a/services/block-crawlers/default.nix
+++ b/services/block-crawlers/default.nix
@ -0,0 +1,32 @@
+{ pkgs, config, lib, ... }:
+let
+  inherit (lib) mkEnableOption mkIf mkOption types concatStringsSep mkDefault splitString filter;
+  cfg = config.bagel.services.nginx.crawler-blocker;
+  # Build a robots.txt: one `User-agent:` line per entry, followed by a single `Disallow: /`.
+  mkRobotsFile = blockedUAs: pkgs.writeText "robots.txt" ''
+    ${concatStringsSep "\n" (map (ua: "User-agent: ${ua}") blockedUAs)}
+    Disallow: /
+  '';
+in
+{
+  options = {
+    bagel.services.nginx.crawler-blocker = {
+      enable = mkEnableOption "the crawler blocker";
+      userAgents = mkOption {
+        type = types.listOf types.str;
+        # Skip empty lines (e.g. from the file's trailing newline) so no bare `User-agent:` is emitted.
+        default = filter (ua: ua != "") (splitString "\n" (builtins.readFile ./blocked-ua.txt));
+      };
+    };
+
+    # When enabled, every nginx virtual host serves the generated robots.txt; mkDefault lets a host override it.
+    services.nginx.virtualHosts = mkOption {
+      type = types.attrsOf (types.submodule {
+        config.locations."= /robots.txt" = mkIf cfg.enable (mkDefault {
+          alias = mkRobotsFile cfg.userAgents;
+        });
+      });
+    };
+  };
+}
+
--- a/services/default.nix
+++ b/services/default.nix
@ -1,5 +1,6 @@
 {
  imports = [
+    ./block-crawlers
    ./gerrit
    ./channel-scripts
    ./hydra
--- a/services/gerrit/www.nix
+++ b/services/gerrit/www.nix
@ -29,10 +29,6 @@ in
            # NGINX should not give up super fast. Things can take time.
            proxy_read_timeout 3600;
          }
-
-          location = /robots.txt {
-            return 200 'User-agent: *\nAllow: /';
-          }
        '';
      };