diff options
-rw-r--r-- | TODO.md | 6 | ||||
-rw-r--r-- | static/robots.txt | 52 |
2 files changed, 57 insertions, 1 deletions
@@ -276,7 +276,7 @@
 - screen
 - mutt/offlineimap/notmuch
 - irb (show irbrc w/3.x mods)
- - git (gitconfig w aliases)
+ - git (gitconfig w/aliases)
 - bash?
 - perf
 - wireguard
@@ -285,6 +285,8 @@
 - postgres, sqlite
 - firefox (ublock origin, tab stash, firefox sync)
 - gnome (extensions: hidetopbar, workspace matrix)
+ - podman
+ - mtr
 - programming languages
 - go
 - ruby
@@ -318,6 +320,7 @@
 - my tirade in LWM comments: <https://lwn.net/Articles/981334/>
 - <https://connect.mozilla.org/t5/discussions/remove-quot-privacy-preserving-attribution-quot-ad-measurement/m-p/62638/highlight/true#M22002>
 - <https://arstechnica.com/gadgets/2024/07/google-will-not-disable-tracking-cookies-in-chrome-after-years-of-trying/>
+ - summary, good tips: <https://blog.zgp.org/turn-off-advertising-features-in-firefox/>
 - nonsense might be in good faith: upton sinclair "it's difficult to get a man to understand something when his job depends on him not understanding it"; both google and mozilla depend on advertising
@@ -539,6 +542,7 @@
 - privacy policy: <https://www.jwz.org/blog/2024/06/your-personal-information-is-very-important-to-us/>
 - <https://arstechnica.com/gadgets/2024/08/nova-launcher-savior-of-cruft-filled-android-phones-is-on-life-support/>
+- software: <https://www.lawfaremedia.org/article/the-crowdstrike-outage-and-market-driven-brittleness>
 ## done
 - add project folders
diff --git a/static/robots.txt b/static/robots.txt
new file mode 100644
index 0000000..1b9dcbf
--- /dev/null
+++ b/static/robots.txt
@@ -0,0 +1,52 @@
+# block LLM bots by user-agent
+# src: https://robotstxt.com/ai
+# updated: 2025-01-30
+User-Agent: GPTBot
+User-Agent: ClaudeBot
+User-Agent: Claude-Web
+User-Agent: CCBot
+User-Agent: Googlebot-Extended
+User-Agent: Applebot-Extended
+User-Agent: Facebookbot
+User-Agent: Meta-ExternalAgent
+User-Agent: Meta-ExternalFetcher
+User-Agent: diffbot
+User-Agent: PerplexityBot
+User-Agent: Omgili
+User-Agent: Omgilibot
+User-Agent: webzio-extended
+User-Agent: ImagesiftBot
+User-Agent: Bytespider
+User-Agent: Amazonbot
+User-Agent: Youbot
+User-Agent: SemrushBot-OCOB
+User-Agent: Petalbot
+User-Agent: VelenPublicWebCrawler
+User-Agent: TurnitinBot
+User-Agent: Timpibot
+User-Agent: OAI-SearchBot
+User-Agent: ICC-Crawler
+User-Agent: AI2Bot
+User-Agent: AI2Bot-Dolma
+User-Agent: DataForSeoBot
+User-Agent: AwarioBot
+User-Agent: AwarioSmartBot
+User-Agent: AwarioRssBot
+User-Agent: Google-CloudVertexBot
+User-Agent: PanguBot
+User-Agent: Kangaroo Bot
+User-Agent: Sentibot
+User-Agent: img2dataset
+User-Agent: Meltwater
+User-Agent: Seekr
+User-Agent: peer39_crawler
+User-Agent: cohere-ai
+User-Agent: cohere-training-data-crawler
+User-Agent: DuckAssistBot
+User-Agent: Scrapy
+Disallow: /
+DisallowAITraining: /
+
+# block non-specific LLM bots (note: experimental directive)
+User-Agent: *
+DisallowAITraining: /