feat: automatically block crawlers when the blocker is enabled #145

Merged
raito merged 1 commit from block-crawlers into main 2024-10-20 09:23:55 +00:00
5 changed files with 76 additions and 4 deletions


@@ -23,6 +23,9 @@
    };
  };
  # Block all these crawlers!!
  bagel.services.nginx.crawler-blocker.enable = true;
  fileSystems."/gerrit-data" = {
    device = "/dev/disk/by-uuid/d1062305-0dea-4740-9a27-b6b1691862a4";
    fsType = "ext4";

block-crawlers/blocked-ua.txt

@@ -0,0 +1,40 @@
AI2Bot
Ai2Bot-Dolma
Amazonbot
anthropic-ai
Applebot
Applebot-Extended
Bytespider
CCBot
ChatGPT-User
Claude-Web
ClaudeBot
cohere-ai
Diffbot
FacebookBot
facebookexternalhit
FriendlyCrawler
Google-Extended
GoogleOther
GoogleOther-Image
GoogleOther-Video
GPTBot
iaskspider/2.0
ICC-Crawler
ImagesiftBot
img2dataset
ISSCyberRiskCrawler
Kangaroo Bot
Meta-ExternalAgent
Meta-ExternalFetcher
OAI-SearchBot
omgili
omgilibot
PerplexityBot
PetalBot
Scrapy
Sidetrade indexer bot
Timpibot
VelenPublicWebCrawler
Webzio-Extended
YouBot

block-crawlers/default.nix

@@ -0,0 +1,32 @@
{ pkgs, config, lib, ... }:
let
  inherit (lib) mkEnableOption mkIf mkOption types concatStringsSep mkDefault splitString;
  cfg = config.bagel.services.nginx.crawler-blocker;
  # One User-agent line per blocked UA, all covered by a single Disallow rule.
  mkRobotsFile = blockedUAs: pkgs.writeText "robots.txt" ''
    ${concatStringsSep "\n" (map (ua: "User-agent: ${ua}") blockedUAs)}
    Disallow: /
  '';
in
{
  options = {
    bagel.services.nginx.crawler-blocker = {
      enable = mkEnableOption "the crawler blocker";
      userAgents = mkOption {
        type = types.listOf types.str;
        default = splitString "\n" (builtins.readFile ./blocked-ua.txt);
      };
    };

    # Re-declare virtualHosts with a submodule so every nginx virtual host
    # serves the block list as its robots.txt; mkDefault keeps the location
    # overridable per host.
    services.nginx.virtualHosts = mkOption {
      type = types.attrsOf (types.submodule {
        config = {
          locations."= /robots.txt" = mkIf cfg.enable (mkDefault {
            alias = mkRobotsFile cfg.userAgents;
          });
        };
      });
    };
  };
}
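
For reference, the file produced by mkRobotsFile with the default list is one User-agent line per blocked crawler followed by a single Disallow rule, roughly (abridged):

User-agent: AI2Bot
User-agent: Ai2Bot-Dolma
User-agent: Amazonbot
...
User-agent: YouBot
Disallow: /

Note that setting the userAgents option replaces the default list read from blocked-ua.txt rather than extending it; a hypothetical override narrowing the blocker to two crawlers would be:

# Hypothetical: replaces the default list read from blocked-ua.txt.
bagel.services.nginx.crawler-blocker.userAgents = [ "GPTBot" "CCBot" ];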


@@ -1,5 +1,6 @@
{
  imports = [
    ./block-crawlers
    ./gerrit
    ./channel-scripts
    ./hydra


@@ -29,10 +29,6 @@ in
    # NGINX should not give up super fast. Things can take time.
    proxy_read_timeout 3600;
  }
  location = /robots.txt {
    return 200 'User-agent: *\nAllow: /';
  }
'';
};
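
The blanket robots.txt location that allowed all crawlers is dropped here, since the new module now supplies a robots.txt for every virtual host. Because the module uses mkDefault, a host that still wants a permissive robots.txt can override the location at normal definition priority; a minimal sketch, with a hypothetical host name:

# Hypothetical per-host override: a normal-priority definition of `alias`
# takes precedence over the blocker's mkDefault one.
services.nginx.virtualHosts."public.example.org".locations."= /robots.txt".alias =
  pkgs.writeText "robots.txt" ''
    User-agent: *
    Allow: /
  '';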