From 23ddb4b34dceeded599e51b2a465af8924ead709 Mon Sep 17 00:00:00 2001 From: Raito Bezarius Date: Sat, 19 Oct 2024 18:55:03 +0200 Subject: [PATCH] feat: automatically block crawlers if the blocker is enabled This helps us get rid of useless traffic from crawlers. It is enabled for gerrit01, which is suffering the most from this. Signed-off-by: Raito Bezarius --- hosts/gerrit01/default.nix | 3 ++ services/block-crawlers/blocked-ua.txt | 40 ++++++++++++++++++++++++++ services/block-crawlers/default.nix | 32 +++++++++++++++++++++ services/default.nix | 1 + services/gerrit/www.nix | 4 --- 5 files changed, 76 insertions(+), 4 deletions(-) create mode 100644 services/block-crawlers/blocked-ua.txt create mode 100644 services/block-crawlers/default.nix diff --git a/hosts/gerrit01/default.nix b/hosts/gerrit01/default.nix index 004f0e5..085dce7 100755 --- a/hosts/gerrit01/default.nix +++ b/hosts/gerrit01/default.nix @@ -23,6 +23,9 @@ }; }; + # Block all these crawlers!! + bagel.services.nginx.crawler-blocker.enable = true; + fileSystems."/gerrit-data" = { device = "/dev/disk/by-uuid/d1062305-0dea-4740-9a27-b6b1691862a4"; fsType = "ext4"; diff --git a/services/block-crawlers/blocked-ua.txt b/services/block-crawlers/blocked-ua.txt new file mode 100644 index 0000000..528f190 --- /dev/null +++ b/services/block-crawlers/blocked-ua.txt @@ -0,0 +1,40 @@ +AI2Bot +Ai2Bot-Dolma +Amazonbot +anthropic-ai +Applebot +Applebot-Extended +Bytespider +CCBot +ChatGPT-User +Claude-Web +ClaudeBot +cohere-ai +Diffbot +FacebookBot +facebookexternalhit +FriendlyCrawler +Google-Extended +GoogleOther +GoogleOther-Image +GoogleOther-Video +GPTBot +iaskspider/2.0 +ICC-Crawler +ImagesiftBot +img2dataset +ISSCyberRiskCrawler +Kangaroo Bot +Meta-ExternalAgent +Meta-ExternalFetcher +OAI-SearchBot +omgili +omgilibot +PerplexityBot +PetalBot +Scrapy +Sidetrade indexer bot +Timpibot +VelenPublicWebCrawler +Webzio-Extended +YouBot diff --git a/services/block-crawlers/default.nix
b/services/block-crawlers/default.nix
new file mode 100644
index 0000000..98924d4
--- /dev/null
+++ b/services/block-crawlers/default.nix
@@ -0,0 +1,49 @@
+# Crawler blocker: when enabled, every NGINX virtual host serves a robots.txt
+# that disallows the configured crawler user agents.
+{ pkgs, config, lib, ... }:
+let
+  inherit (lib) mkEnableOption mkIf mkOption types concatStringsSep mkDefault splitString;
+  cfg = config.bagel.services.nginx.crawler-blocker;
+
+  # User agents listed in ./blocked-ua.txt, one per line.
+  # splitString is used instead of builtins.split because the latter interleaves
+  # the pieces with lists of regex match groups, which are not strings and fail
+  # the listOf str type check. Empty pieces (such as the one after a trailing
+  # newline) are dropped so no bare "User-Agent: " line ends up in robots.txt.
+  defaultUserAgents = builtins.filter (ua: ua != "")
+    (splitString "\n" (builtins.readFile ./blocked-ua.txt));
+
+  # Builds a robots.txt with one User-Agent line per blocked agent, followed by
+  # a single "Disallow: /" rule.
+  mkRobotsFile = blockedUAs: pkgs.writeText "robots.txt" ''
+    ${concatStringsSep "\n" (map (ua: "User-Agent: ${ua}") blockedUAs)}
+    Disallow: /
+  '';
+in
+{
+  options = {
+    bagel.services.nginx.crawler-blocker = {
+      enable = mkEnableOption "the crawler blocker";
+
+      userAgents = mkOption {
+        type = types.listOf types.str;
+        default = defaultUserAgents;
+        description = "User agents that the generated robots.txt disallows.";
+      };
+    };
+
+    # Redeclaring virtualHosts with an extra submodule adds the location below to
+    # every virtual host.
+    services.nginx.virtualHosts = mkOption {
+      type = types.attrsOf (types.submodule {
+        config = {
+          # mkDefault lowers the priority so that a virtual host defining its own
+          # "= /robots.txt" location takes precedence over this one.
+          locations."= /robots.txt" = mkIf cfg.enable (mkDefault {
+            alias = mkRobotsFile cfg.userAgents;
+          });
+        };
+      });
+    };
+  };
+}
diff --git a/services/default.nix b/services/default.nix index 6cbad0f..08a2272 100644 --- a/services/default.nix +++ b/services/default.nix @@ -1,5 +1,6 @@ { imports = [ + ./block-crawlers ./gerrit ./channel-scripts ./hydra diff --git a/services/gerrit/www.nix b/services/gerrit/www.nix index 83cdcb0..e2b8561 100644 --- a/services/gerrit/www.nix +++ b/services/gerrit/www.nix @@ -29,10 +29,6 @@ in # NGINX should not give up super fast. Things can take time. proxy_read_timeout 3600; } - - location = /robots.txt { - return 200 'User-agent: *\nAllow: /'; - } ''; };