feat: automatically block crawlers if the blocker is enabled #145
@@ -23,6 +23,9 @@
     };
   };
 
+  # Block all these crawlers!!
+  bagel.services.nginx.crawler-blocker.enable = true;
+
   fileSystems."/gerrit-data" = {
     device = "/dev/disk/by-uuid/d1062305-0dea-4740-9a27-b6b1691862a4";
     fsType = "ext4";
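Note: the enable flag set above is declared by the crawler-blocker module added later in this diff. That module also exposes a userAgents list option, so a host could, purely as a hypothetical sketch, override the default blocklist at the same time (both bot names below appear in blocked-ua.txt):

    # Hypothetical per-host override (sketch): enable the blocker and
    # replace the default list from blocked-ua.txt with a shorter one.
    bagel.services.nginx.crawler-blocker = {
      enable = true;
      userAgents = [ "GPTBot" "ClaudeBot" ];
    };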
services/block-crawlers/blocked-ua.txt (new file, 40 lines)
@@ -0,0 +1,40 @@
+AI2Bot
+Ai2Bot-Dolma
+Amazonbot
+anthropic-ai
+Applebot
+Applebot-Extended
+Bytespider
+CCBot
+ChatGPT-User
+Claude-Web
+ClaudeBot
+cohere-ai
+Diffbot
+FacebookBot
+facebookexternalhit
+FriendlyCrawler
+Google-Extended
+GoogleOther
+GoogleOther-Image
+GoogleOther-Video
+GPTBot
+iaskspider/2.0
+ICC-Crawler
+ImagesiftBot
+img2dataset
+ISSCyberRiskCrawler
+Kangaroo Bot
+Meta-ExternalAgent
+Meta-ExternalFetcher
+OAI-SearchBot
+omgili
+omgilibot
+PerplexityBot
+PetalBot
+Scrapy
+Sidetrade indexer bot
+Timpibot
+VelenPublicWebCrawler
+Webzio-Extended
+YouBot
services/block-crawlers/default.nix (new file, 32 lines)
@@ -0,0 +1,32 @@
+{ pkgs, config, lib, ... }:
+let
+  inherit (lib) mkEnableOption mkIf mkOption types concatStringsSep mkDefault splitString;
+  cfg = config.bagel.services.nginx.crawler-blocker;
+  mkRobotsFile = blockedUAs: pkgs.writeText "robots.txt" ''
+    ${concatStringsSep "\n" (map (ua: "User-agent: ${ua}") blockedUAs)}
+    Disallow: /
+  '';
+in
+{
+  options = {
+    bagel.services.nginx.crawler-blocker = {
+      enable = mkEnableOption "the crawler blocker";
+
+      userAgents = mkOption {
+        type = types.listOf types.str;
+        default = splitString "\n" (builtins.readFile ./blocked-ua.txt);
+      };
+    };
+
+    services.nginx.virtualHosts = mkOption {
+      type = types.attrsOf (types.submodule {
+        config = {
+          locations."= /robots.txt" = mkIf cfg.enable (mkDefault {
+            alias = mkRobotsFile cfg.userAgents;
+          });
+        };
+      });
+    };
+  };
+}
+
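For illustration: mkRobotsFile emits one User-agent line per blocked UA followed by a single Disallow rule, so with the default list the generated robots.txt looks roughly like this (only the first and last entries shown):

    User-agent: AI2Bot
    User-agent: Ai2Bot-Dolma
    User-agent: Amazonbot
    ...
    User-agent: YouBot
    Disallow: /

Because the injected location is wrapped in mkDefault, an individual virtual host can still define its own robots.txt and take precedence over the generated one.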
@@ -1,5 +1,6 @@
 {
   imports = [
+    ./block-crawlers
     ./gerrit
     ./channel-scripts
     ./hydra
@@ -29,10 +29,6 @@ in
       # NGINX should not give up super fast. Things can take time.
       proxy_read_timeout 3600;
     }
-
-    location = /robots.txt {
-      return 200 'User-agent: *\nAllow: /';
-    }
   '';
 };
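With the hard-coded permissive robots.txt removed here, robots.txt is expected to come from the crawler-blocker module instead. For hosts that enable it, the per-vhost location the module injects renders to nginx configuration roughly like the following sketch (the store path is a placeholder, not an actual hash):

    location = /robots.txt {
        alias /nix/store/<hash>-robots.txt;
    }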