feat: block automatically crawlers if the blocker is enabled
This help us getting rid of useless traffic by crawlers. It is enabled for gerrit01 which is suffering the most from this. Signed-off-by: Raito Bezarius <masterancpp@gmail.com>
This commit is contained in:
parent
d5500d7c4e
commit
23ddb4b34d
|
@ -23,6 +23,9 @@
|
|||
};
|
||||
};
|
||||
|
||||
# Block all these crawlers!!
|
||||
bagel.services.nginx.crawler-blocker.enable = true;
|
||||
|
||||
fileSystems."/gerrit-data" = {
|
||||
device = "/dev/disk/by-uuid/d1062305-0dea-4740-9a27-b6b1691862a4";
|
||||
fsType = "ext4";
|
||||
|
|
40
services/block-crawlers/blocked-ua.txt
Normal file
40
services/block-crawlers/blocked-ua.txt
Normal file
|
@ -0,0 +1,40 @@
|
|||
AI2Bot
|
||||
Ai2Bot-Dolma
|
||||
Amazonbot
|
||||
anthropic-ai
|
||||
Applebot
|
||||
Applebot-Extended
|
||||
Bytespider
|
||||
CCBot
|
||||
ChatGPT-User
|
||||
Claude-Web
|
||||
ClaudeBot
|
||||
cohere-ai
|
||||
Diffbot
|
||||
FacebookBot
|
||||
facebookexternalhit
|
||||
FriendlyCrawler
|
||||
Google-Extended
|
||||
GoogleOther
|
||||
GoogleOther-Image
|
||||
GoogleOther-Video
|
||||
GPTBot
|
||||
iaskspider/2.0
|
||||
ICC-Crawler
|
||||
ImagesiftBot
|
||||
img2dataset
|
||||
ISSCyberRiskCrawler
|
||||
Kangaroo Bot
|
||||
Meta-ExternalAgent
|
||||
Meta-ExternalFetcher
|
||||
OAI-SearchBot
|
||||
omgili
|
||||
omgilibot
|
||||
PerplexityBot
|
||||
PetalBot
|
||||
Scrapy
|
||||
Sidetrade indexer bot
|
||||
Timpibot
|
||||
VelenPublicWebCrawler
|
||||
Webzio-Extended
|
||||
YouBot
|
32
services/block-crawlers/default.nix
Normal file
32
services/block-crawlers/default.nix
Normal file
|
@ -0,0 +1,32 @@
|
|||
{ pkgs, config, lib, ... }:
|
||||
let
|
||||
inherit (lib) mkEnableOption mkIf mkOption types concatStringsSep mkDefault;
|
||||
cfg = config.bagel.services.nginx.crawler-blocker;
|
||||
mkRobotsFile = blockedUAs: pkgs.writeText "robots.txt" ''
|
||||
${concatStringsSep "\n" (map (ua: "User-Agent: ${ua}") blockedUAs)}
|
||||
Disallow: /
|
||||
'';
|
||||
in
|
||||
{
|
||||
options = {
|
||||
bagel.services.nginx.crawler-blocker = {
|
||||
enable = mkEnableOption "the crawler blocker";
|
||||
|
||||
userAgents = mkOption {
|
||||
type = types.listOf types.str;
|
||||
default = builtins.split "\n" (builtins.readFile ./blocked-ua.txt);
|
||||
};
|
||||
};
|
||||
|
||||
services.nginx.virtualHosts = mkOption {
|
||||
type = types.attrsOf (types.submodule {
|
||||
config = {
|
||||
locations."= /robots.txt" = mkIf cfg.enable (mkDefault {
|
||||
alias = mkRobotsFile cfg.userAgents;
|
||||
});
|
||||
};
|
||||
});
|
||||
};
|
||||
};
|
||||
}
|
||||
|
|
@ -1,5 +1,6 @@
|
|||
{
|
||||
imports = [
|
||||
./block-crawlers
|
||||
./gerrit
|
||||
./channel-scripts
|
||||
./hydra
|
||||
|
|
|
@ -29,10 +29,6 @@ in
|
|||
# NGINX should not give up super fast. Things can take time.
|
||||
proxy_read_timeout 3600;
|
||||
}
|
||||
|
||||
location = /robots.txt {
|
||||
return 200 'User-agent: *\nAllow: /';
|
||||
}
|
||||
'';
|
||||
};
|
||||
|
||||
|
|
Loading…
Reference in a new issue