feat: automatically block crawlers when the blocker is enabled #145

Merged
raito merged 1 commit from block-crawlers into main 2024-10-20 09:23:55 +00:00
5 changed files with 76 additions and 4 deletions


@@ -23,6 +23,9 @@
    };
  };
  # Block all these crawlers!!
  bagel.services.nginx.crawler-blocker.enable = true;
  fileSystems."/gerrit-data" = {
    device = "/dev/disk/by-uuid/d1062305-0dea-4740-9a27-b6b1691862a4";
    fsType = "ext4";

block-crawlers/blocked-ua.txt

@@ -0,0 +1,40 @@
AI2Bot
Ai2Bot-Dolma
Amazonbot
anthropic-ai
Applebot
Applebot-Extended
Bytespider
CCBot
ChatGPT-User
Claude-Web
ClaudeBot
cohere-ai
Diffbot
FacebookBot
facebookexternalhit
FriendlyCrawler
Google-Extended
GoogleOther
GoogleOther-Image
GoogleOther-Video
GPTBot
iaskspider/2.0
ICC-Crawler
ImagesiftBot
img2dataset
ISSCyberRiskCrawler
Kangaroo Bot
Meta-ExternalAgent
Meta-ExternalFetcher
OAI-SearchBot
omgili
omgilibot
PerplexityBot
PetalBot
Scrapy
Sidetrade indexer bot
Timpibot
VelenPublicWebCrawler
Webzio-Extended
YouBot

block-crawlers/default.nix

@@ -0,0 +1,32 @@
{ pkgs, config, lib, ... }:
let
  inherit (lib) mkEnableOption mkIf mkOption types concatStringsSep mkDefault splitString;
  cfg = config.bagel.services.nginx.crawler-blocker;
  # One User-agent line per blocked UA, all covered by a single Disallow rule.
  mkRobotsFile = blockedUAs: pkgs.writeText "robots.txt" ''
    ${concatStringsSep "\n" (map (ua: "User-agent: ${ua}") blockedUAs)}
    Disallow: /
  '';
in
{
  options = {
    bagel.services.nginx.crawler-blocker = {
      enable = mkEnableOption "the crawler blocker";
      userAgents = mkOption {
        type = types.listOf types.str;
        default = splitString "\n" (builtins.readFile ./blocked-ua.txt);
      };
    };

    # Re-declare virtualHosts with a submodule so every nginx virtual host
    # serves the block list as its robots.txt; mkDefault keeps the location
    # overridable per host.
    services.nginx.virtualHosts = mkOption {
      type = types.attrsOf (types.submodule {
        config = {
          locations."= /robots.txt" = mkIf cfg.enable (mkDefault {
            alias = mkRobotsFile cfg.userAgents;
          });
        };
      });
    };
  };
}
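
For reference, the file produced by mkRobotsFile with the default list is one User-agent line per blocked crawler followed by a single Disallow rule, roughly (abridged):

User-agent: AI2Bot
User-agent: Ai2Bot-Dolma
User-agent: Amazonbot
...
User-agent: YouBot
Disallow: /

Note that setting the userAgents option replaces the default list read from blocked-ua.txt rather than extending it; a hypothetical override narrowing the blocker to two crawlers would be:

# Hypothetical: replaces the default list read from blocked-ua.txt.
bagel.services.nginx.crawler-blocker.userAgents = [ "GPTBot" "CCBot" ];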


@@ -1,5 +1,6 @@
{
  imports = [
    ./block-crawlers
    ./gerrit
    ./channel-scripts
    ./hydra


@@ -29,10 +29,6 @@ in
    # NGINX should not give up super fast. Things can take time.
    proxy_read_timeout 3600;
  }
  location = /robots.txt {
    return 200 'User-agent: *\nAllow: /';
  }
'';
};
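
The blanket robots.txt location that allowed all crawlers is dropped here, since the new module now supplies a robots.txt for every virtual host. Because the module uses mkDefault, a host that still wants a permissive robots.txt can override the location at normal definition priority; a minimal sketch, with a hypothetical host name:

# Hypothetical per-host override: a normal-priority definition of `alias`
# takes precedence over the blocker's mkDefault one.
services.nginx.virtualHosts."public.example.org".locations."= /robots.txt".alias =
  pkgs.writeText "robots.txt" ''
    User-agent: *
    Allow: /
  '';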