feat: automatically block crawlers if the blocker is enabled

This helps us get rid of useless traffic from crawlers.

It is enabled for gerrit01, which is suffering the most from this.

Signed-off-by: Raito Bezarius <masterancpp@gmail.com>
Author: raito
Date:   2024-10-19 18:55:03 +02:00
Parent: d5500d7c4e
Commit: 23ddb4b34d

5 changed files with 76 additions and 4 deletions


@@ -23,6 +23,9 @@
};
};
# Block all these crawlers!!
bagel.services.nginx.crawler-blocker.enable = true;
fileSystems."/gerrit-data" = {
device = "/dev/disk/by-uuid/d1062305-0dea-4740-9a27-b6b1691862a4";
fsType = "ext4";


@@ -0,0 +1,40 @@
AI2Bot
Ai2Bot-Dolma
Amazonbot
anthropic-ai
Applebot
Applebot-Extended
Bytespider
CCBot
ChatGPT-User
Claude-Web
ClaudeBot
cohere-ai
Diffbot
FacebookBot
facebookexternalhit
FriendlyCrawler
Google-Extended
GoogleOther
GoogleOther-Image
GoogleOther-Video
GPTBot
iaskspider/2.0
ICC-Crawler
ImagesiftBot
img2dataset
ISSCyberRiskCrawler
Kangaroo Bot
Meta-ExternalAgent
Meta-ExternalFetcher
OAI-SearchBot
omgili
omgilibot
PerplexityBot
PetalBot
Scrapy
Sidetrade indexer bot
Timpibot
VelenPublicWebCrawler
Webzio-Extended
YouBot


@@ -0,0 +1,32 @@
{ pkgs, config, lib, ... }:
let
  inherit (lib) mkEnableOption mkIf mkOption types concatStringsSep mkDefault filter splitString;
  cfg = config.bagel.services.nginx.crawler-blocker;
  mkRobotsFile = blockedUAs: pkgs.writeText "robots.txt" ''
    ${concatStringsSep "\n" (map (ua: "User-Agent: ${ua}") blockedUAs)}
    Disallow: /
  '';
in
{
  options = {
    bagel.services.nginx.crawler-blocker = {
      enable = mkEnableOption "the crawler blocker";
      userAgents = mkOption {
        type = types.listOf types.str;
        # splitString yields plain strings (unlike builtins.split, which interleaves
        # regex match lists); drop the empty entry left by the trailing newline.
        default = filter (ua: ua != "") (splitString "\n" (builtins.readFile ./blocked-ua.txt));
      };
    };

    services.nginx.virtualHosts = mkOption {
      type = types.attrsOf (types.submodule {
        config = {
          locations."= /robots.txt" = mkIf cfg.enable (mkDefault {
            alias = mkRobotsFile cfg.userAgents;
          });
        };
      });
    };
  };
}
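For context, mkRobotsFile renders one User-Agent line per blocked crawler followed by a single Disallow rule. A minimal sketch of the generated robots.txt, assuming only the first two entries of blocked-ua.txt:

User-Agent: AI2Bot
User-Agent: Ai2Bot-Dolma
Disallow: /

Since the location is registered with mkDefault, an individual virtual host can still override its own robots.txt when needed.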


@@ -1,5 +1,6 @@
{
  imports = [
    ./block-crawlers
    ./gerrit
    ./channel-scripts
    ./hydra


@@ -29,10 +29,6 @@ in
# NGINX should not give up super fast. Things can take time.
proxy_read_timeout 3600;
}
location = /robots.txt {
return 200 'User-agent: *\nAllow: /';
}
'';
};
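With the permissive robots.txt response removed here, the mkDefault location from the crawler blocker takes over for this virtual host. Roughly what the generated nginx config should contain once the blocker is enabled (the store path is illustrative, not taken from the commit):

location = /robots.txt {
    alias /nix/store/<hash>-robots.txt;
}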