Make bot regexes case-insensitive
author Lanius Trolling <lanius@laniustrolling.dev>
Sat, 27 Jan 2024 14:16:05 +0000 (09:16 -0500)
committer Lanius Trolling <lanius@laniustrolling.dev>
Sat, 27 Jan 2024 14:17:50 +0000 (09:17 -0500)
src/jvmMain/kotlin/info/mechyrdia/data/visits.kt

index 9ee6459f38aa53a29ca9319acfee0cc6128a970c..4ab2a91b7a71a73a18a681f7b443968e63386459 100644 (file)
@@ -15,6 +15,7 @@ import kotlinx.html.p
 import kotlinx.html.style
 import kotlinx.serialization.SerialName
 import kotlinx.serialization.Serializable
+import org.intellij.lang.annotations.Language
 import java.security.MessageDigest
 import java.time.Instant
 
@@ -125,183 +126,186 @@ fun FlowContent.guestbook(totalsData: PageVisitTotals) {
 }
 
 object RobotDetector {
+       private fun botRegex(@Language("RegExp") regex: String) = Regex(regex, RegexOption.IGNORE_CASE) // Compiles a bot pattern with IGNORE_CASE so it matches user agents regardless of letter case.
+       
        private val botRegexes = listOf(
-               Regex(" daum[ /]"),
-               Regex(" deusu/"),
-               Regex(" yadirectfetcher"),
-               Regex("(?:^| )site"),
-               Regex("(?:^|[^g])news"),
-               Regex("@[a-z]"),
-               Regex("\\(at\\)[a-z]"),
-               Regex("\\(github\\.com/"),
-               Regex("\\[at][a-z]"),
-               Regex("^12345"),
-               Regex("^<"),
-               Regex("^[\\w .\\-()]+(/v?\\d+(\\.\\d+)?(\\.\\d{1,10})?)?$"),
-               Regex("^[^ ]{50,}$"),
-               Regex("^active"),
-               Regex("^ad muncher"),
-               Regex("^amaya"),
-               Regex("^anglesharp/"),
-               Regex("^anonymous"),
-               Regex("^avsdevicesdk/"),
-               Regex("^axios/"),
-               Regex("^bidtellect/"),
-               Regex("^biglotron"),
-               Regex("^btwebclient/"),
-               Regex("^castro"),
-               Regex("^clamav[ /]"),
-               Regex("^client/"),
-               Regex("^cobweb/"),
-               Regex("^coccoc"),
-               Regex("^custom"),
-               Regex("^ddg[_-]android"),
-               Regex("^discourse"),
-               Regex("^dispatch/\\d"),
-               Regex("^downcast/"),
-               Regex("^duckduckgo"),
-               Regex("^facebook"),
-               Regex("^fdm[ /]\\d"),
-               Regex("^getright/"),
-               Regex("^gozilla/"),
-               Regex("^hatena"),
-               Regex("^hobbit"),
-               Regex("^hotzonu"),
-               Regex("^hwcdn/"),
-               Regex("^jeode/"),
-               Regex("^jetty/"),
-               Regex("^jigsaw"),
-               Regex("^linkdex"),
-               Regex("^lwp[-: ]"),
-               Regex("^metauri"),
-               Regex("^microsoft bits"),
-               Regex("^movabletype"),
-               Regex("^mozilla/\\d\\.\\d \\(compatible;?\\)$"),
-               Regex("^mozilla/\\d\\.\\d \\w*$"),
-               Regex("^navermailapp"),
-               Regex("^netsurf"),
-               Regex("^offline explorer"),
-               Regex("^php"),
-               Regex("^postman"),
-               Regex("^postrank"),
-               Regex("^python"),
-               Regex("^read"),
-               Regex("^reed"),
-               Regex("^restsharp/"),
-               Regex("^snapchat"),
-               Regex("^space bison"),
-               Regex("^svn"),
-               Regex("^swcd "),
-               Regex("^taringa"),
-               Regex("^test certificate info"),
-               Regex("^thumbor/"),
-               Regex("^tumblr/"),
-               Regex("^user-agent:mozilla"),
-               Regex("^valid"),
-               Regex("^venus/fedoraplanet"),
-               Regex("^w3c"),
-               Regex("^webbandit/"),
-               Regex("^webcopier"),
-               Regex("^wget"),
-               Regex("^whatsapp"),
-               Regex("^xenu link sleuth"),
-               Regex("^yahoo"),
-               Regex("^yandex"),
-               Regex("^zdm/\\d"),
-               Regex("^zoom marketplace/"),
-               Regex("^\\{\\{.*\\}\\}$"),
-               Regex("adbeat\\.com"),
-               Regex("appinsights"),
-               Regex("archive"),
-               Regex("ask jeeves/teoma"),
-               Regex("bit\\.ly/"),
-               Regex("bluecoat drtr"),
-               Regex("bot"),
-               Regex("browsex"),
-               Regex("burpcollaborator"),
-               Regex("capture"),
-               Regex("catch"),
-               Regex("check"),
-               Regex("chrome-lighthouse"),
-               Regex("chromeframe"),
-               Regex("cloud"),
-               Regex("crawl"),
-               Regex("cryptoapi"),
-               Regex("dareboost"),
-               Regex("datanyze"),
-               Regex("dataprovider"),
-               Regex("dejaclick"),
-               Regex("dmbrowser"),
-               Regex("download"),
-               Regex("evc-batch/"),
-               Regex("feed"),
-               Regex("firephp"),
-               Regex("freesafeip"),
-               Regex("ghost"),
-               Regex("gomezagent"),
-               Regex("google"),
-               Regex("headlesschrome/"),
-               Regex("http"),
-               Regex("httrack"),
-               Regex("hubspot marketing grader"),
-               Regex("hydra"),
-               Regex("ibisbrowser"),
-               Regex("images"),
-               Regex("iplabel"),
-               Regex("ips-agent"),
-               Regex("java"),
-               Regex("library"),
-               Regex("mail\\.ru/"),
-               Regex("manager"),
-               Regex("monitor"),
-               Regex("morningscore/"),
-               Regex("neustar wpm"),
-               Regex("nutch"),
-               Regex("offbyone"),
-               Regex("optimize"),
-               Regex("pageburst"),
-               Regex("pagespeed"),
-               Regex("perl"),
-               Regex("phantom"),
-               Regex("pingdom"),
-               Regex("powermarks"),
-               Regex("preview"),
-               Regex("proxy"),
-               Regex("ptst[ /]\\d"),
-               Regex("reader"),
-               Regex("rexx;"),
-               Regex("rigor"),
-               Regex("rss"),
-               Regex("scan"),
-               Regex("scrape"),
-               Regex("search"),
-               Regex("serp ?reputation ?management"),
-               Regex("server"),
-               Regex("sogou"),
-               Regex("sparkler/"),
-               Regex("speedcurve"),
-               Regex("spider"),
-               Regex("splash"),
-               Regex("statuscake"),
-               Regex("stumbleupon\\.com"),
-               Regex("supercleaner"),
-               Regex("synapse"),
-               Regex("synthetic"),
-               Regex("taginspector/"),
-               Regex("torrent"),
-               Regex("tracemyfile"),
-               Regex("transcoder"),
-               Regex("trendsmapresolver"),
-               Regex("twingly recon"),
-               Regex("url"),
-               Regex("virtuoso"),
-               Regex("wappalyzer"),
-               Regex("webglance"),
-               Regex("webkit2png"),
-               Regex("websitemetadataretriever"),
-               Regex("whatcms/"),
-               Regex("wordpress"),
-               Regex("zgrab"),
+               botRegex(" daum[ /]"),
+               botRegex(" deusu/"),
+               botRegex(" yadirectfetcher"),
+               botRegex("(?:^| )site"),
+               botRegex("(?:^|[^g])news"),
+               botRegex("@[a-z]"),
+               botRegex("\\(at\\)[a-z]"),
+               botRegex("\\(github\\.com/"),
+               botRegex("\\[at][a-z]"),
+               botRegex("^12345"),
+               botRegex("^<"),
+               botRegex("^[\\w .\\-()]+(/v?\\d+(\\.\\d+)?(\\.\\d{1,10})?)?$"),
+               botRegex("^[^ ]{50,}$"),
+               botRegex("^active"),
+               botRegex("^ad muncher"),
+               botRegex("^amaya"),
+               botRegex("^anglesharp/"),
+               botRegex("^anonymous"),
+               botRegex("^avsdevicesdk/"),
+               botRegex("^axios/"),
+               botRegex("^bidtellect/"),
+               botRegex("^biglotron"),
+               botRegex("^btwebclient/"),
+               botRegex("^castro"),
+               botRegex("^clamav[ /]"),
+               botRegex("^client/"),
+               botRegex("^cobweb/"),
+               botRegex("^coccoc"),
+               botRegex("^custom"),
+               botRegex("^ddg[_-]android"),
+               botRegex("^discourse"),
+               botRegex("^dispatch/\\d"),
+               botRegex("^downcast/"),
+               botRegex("^duckduckgo"),
+               botRegex("^facebook"),
+               botRegex("^fdm[ /]\\d"),
+               botRegex("^getright/"),
+               botRegex("^gozilla/"),
+               botRegex("^hatena"),
+               botRegex("^hobbit"),
+               botRegex("^hotzonu"),
+               botRegex("^hwcdn/"),
+               botRegex("^jeode/"),
+               botRegex("^jetty/"),
+               botRegex("^jigsaw"),
+               botRegex("^linkdex"),
+               botRegex("^lwp[-: ]"),
+               botRegex("^metauri"),
+               botRegex("^microsoft bits"),
+               botRegex("^movabletype"),
+               botRegex("^mozilla/\\d\\.\\d \\(compatible;?\\)$"),
+               botRegex("^mozilla/\\d\\.\\d \\w*$"),
+               botRegex("^navermailapp"),
+               botRegex("^netsurf"),
+               botRegex("^offline explorer"),
+               botRegex("^php"),
+               botRegex("^postman"),
+               botRegex("^postrank"),
+               botRegex("^python"),
+               botRegex("^read"),
+               botRegex("^reed"),
+               botRegex("^restsharp/"),
+               botRegex("^snapchat"),
+               botRegex("^space bison"),
+               botRegex("^svn"),
+               botRegex("^swcd "),
+               botRegex("^taringa"),
+               botRegex("^test certificate info"),
+               botRegex("^thumbor/"),
+               botRegex("^tumblr/"),
+               botRegex("^user-agent:mozilla"),
+               botRegex("^valid"),
+               botRegex("^venus/fedoraplanet"),
+               botRegex("^w3c"),
+               botRegex("^webbandit/"),
+               botRegex("^webcopier"),
+               botRegex("^wget"),
+               botRegex("^whatsapp"),
+               botRegex("^xenu link sleuth"),
+               botRegex("^yahoo"),
+               botRegex("^yandex"),
+               botRegex("^zdm/\\d"),
+               botRegex("^zoom marketplace/"),
+               botRegex("^\\{\\{.*}}$"),
+               botRegex("adbeat\\.com"),
+               botRegex("appinsights"),
+               botRegex("archive"),
+               botRegex("ask jeeves/teoma"),
+               botRegex("bit\\.ly/"),
+               botRegex("bluecoat drtr"),
+               botRegex("bot"),
+               botRegex("browsex"),
+               botRegex("burpcollaborator"),
+               botRegex("capture"),
+               botRegex("catch"),
+               botRegex("check"),
+               botRegex("chrome-lighthouse"),
+               botRegex("chromeframe"),
+               botRegex("cloud"),
+               botRegex("crawl"),
+               botRegex("cryptoapi"),
+               botRegex("dareboost"),
+               botRegex("datanyze"),
+               botRegex("dataprovider"),
+               botRegex("dejaclick"),
+               botRegex("dmbrowser"),
+               botRegex("download"),
+               botRegex("evc-batch/"),
+               botRegex("feed"),
+               botRegex("firephp"),
+               botRegex("freesafeip"),
+               botRegex("ghost"),
+               botRegex("gomezagent"),
+               botRegex("google"),
+               botRegex("headlesschrome/"),
+               botRegex("http"),
+               botRegex("httrack"),
+               botRegex("hubspot marketing grader"),
+               botRegex("hydra"),
+               botRegex("ibisbrowser"),
+               botRegex("images"),
+               botRegex("iplabel"),
+               botRegex("ips-agent"),
+               botRegex("java"),
+               botRegex("library"),
+               botRegex("mail\\.ru/"),
+               botRegex("manager"),
+               botRegex("monitor"),
+               botRegex("morningscore/"),
+               botRegex("neustar wpm"),
+               botRegex("nutch"),
+               botRegex("offbyone"),
+               botRegex("optimize"),
+               botRegex("pageburst"),
+               botRegex("pagespeed"),
+               botRegex("perl"),
+               botRegex("phantom"),
+               botRegex("pingdom"),
+               botRegex("powermarks"),
+               botRegex("preview"),
+               botRegex("proxy"),
+               botRegex("ptst[ /]\\d"),
+               botRegex("rainmeter webparser plugin"),
+               botRegex("reader"),
+               botRegex("rexx;"),
+               botRegex("rigor"),
+               botRegex("rss"),
+               botRegex("scan"),
+               botRegex("scrape"),
+               botRegex("search"),
+               botRegex("serp ?reputation ?management"),
+               botRegex("server"),
+               botRegex("sogou"),
+               botRegex("sparkler/"),
+               botRegex("speedcurve"),
+               botRegex("spider"),
+               botRegex("splash"),
+               botRegex("statuscake"),
+               botRegex("stumbleupon\\.com"),
+               botRegex("supercleaner"),
+               botRegex("synapse"),
+               botRegex("synthetic"),
+               botRegex("taginspector/"),
+               botRegex("torrent"),
+               botRegex("tracemyfile"),
+               botRegex("transcoder"),
+               botRegex("trendsmapresolver"),
+               botRegex("twingly recon"),
+               botRegex("url"),
+               botRegex("virtuoso"),
+               botRegex("wappalyzer"),
+               botRegex("webglance"),
+               botRegex("webkit2png"),
+               botRegex("websitemetadataretriever"),
+               botRegex("whatcms/"),
+               botRegex("wordpress"),
+               botRegex("zgrab"),
        )
        
        fun isRobot(userAgent: String?) = userAgent == null || botRegexes.any { it.containsMatchIn(userAgent) }