feat: automatically block crawlers if the blocker is enabled

This helps us get rid of useless crawler traffic. It is enabled for gerrit01, which suffers the most from it. Signed-off-by: Raito Bezarius <masterancpp@gmail.com>
2024-10-19 18:55:03 +02:00 · 2024-10-19 18:55:03 +02:00 · 23ddb4b34d
parent d5500d7c4e
commit 23ddb4b34d
5 changed files with 76 additions and 4 deletions
--- a/hosts/gerrit01/default.nix
+++ b/hosts/gerrit01/default.nix
@ -23,6 +23,9 @@
    };
  };
  # Ask known crawlers to stay away by serving a restrictive robots.txt.
  bagel.services.nginx.crawler-blocker.enable = true;
  fileSystems."/gerrit-data" = {
    device = "/dev/disk/by-uuid/d1062305-0dea-4740-9a27-b6b1691862a4";
    fsType = "ext4";
--- a/services/block-crawlers/blocked-ua.txt
+++ b/services/block-crawlers/blocked-ua.txt
@ -0,0 +1,40 @@
 AI2Bot
 Ai2Bot-Dolma
 Amazonbot
 anthropic-ai
 Applebot
 Applebot-Extended
 Bytespider
 CCBot
 ChatGPT-User
 Claude-Web
 ClaudeBot
 cohere-ai
 Diffbot
 FacebookBot
 facebookexternalhit
 FriendlyCrawler
 Google-Extended
 GoogleOther
 GoogleOther-Image
 GoogleOther-Video
 GPTBot
 iaskspider/2.0
 ICC-Crawler
 ImagesiftBot
 img2dataset
 ISSCyberRiskCrawler
 Kangaroo Bot
 Meta-ExternalAgent
 Meta-ExternalFetcher
 OAI-SearchBot
 omgili
 omgilibot
 PerplexityBot
 PetalBot
 Scrapy
 Sidetrade indexer bot
 Timpibot
 VelenPublicWebCrawler
 Webzio-Extended
 YouBot
--- a/services/block-crawlers/default.nix
+++ b/services/block-crawlers/default.nix
@ -0,0 +1,32 @@
 # NixOS module: when `bagel.services.nginx.crawler-blocker.enable` is set,
 # every NGINX virtual host gets a default `/robots.txt` that disallows the
 # configured crawler user agents.
 { pkgs, config, lib, ... }:
 let
  inherit (lib) mkEnableOption mkIf mkOption types concatStringsSep mkDefault splitString filter;
  cfg = config.bagel.services.nginx.crawler-blocker;

  # Builds a robots.txt in the store: one `User-Agent:` line per blocked
  # agent, followed by a single `Disallow: /` rule that applies to all of them.
  mkRobotsFile = blockedUAs: pkgs.writeText "robots.txt" ''
    ${concatStringsSep "\n" (map (ua: "User-Agent: ${ua}") blockedUAs)}
    Disallow: /
  '';
 in
 {
  options = {
    bagel.services.nginx.crawler-blocker = {
      enable = mkEnableOption "the crawler blocker";

      userAgents = mkOption {
        type = types.listOf types.str;
        # `builtins.split` interleaves a list of regex capture groups between
        # the string pieces, which `listOf str` rejects; `splitString` yields
        # strings only. The filter drops empty entries (e.g. from a trailing
        # newline) so no empty `User-Agent:` line is emitted.
        default = filter (ua: ua != "") (splitString "\n" (builtins.readFile ./blocked-ua.txt));
        description = "User agents that the generated robots.txt disallows from the whole site.";
      };
    };

    # Re-declaring the option merges this submodule into every virtual host,
    # so each one receives the robots.txt location below.
    services.nginx.virtualHosts = mkOption {
      type = types.attrsOf (types.submodule {
        config = {
          # mkDefault lets an individual virtual host override this location.
          locations."= /robots.txt" = mkIf cfg.enable (mkDefault {
            alias = mkRobotsFile cfg.userAgents;
          });
        };
      });
    };
  };
 }
--- a/services/default.nix
+++ b/services/default.nix
@ -1,5 +1,6 @@
 {
  imports = [
    ./block-crawlers
    ./gerrit
    ./channel-scripts
    ./hydra
--- a/services/gerrit/www.nix
+++ b/services/gerrit/www.nix
@ -29,10 +29,6 @@ in
            # NGINX should not give up super fast. Things can take time.
            proxy_read_timeout 3600;
          }
          location = /robots.txt {
            return 200 'User-agent: *\nAllow: /';
          }
        '';
      };