symys-add-loggedin-dontblock #1

Open
symys wants to merge 8 commits from symys/forgejo-crawler-blocker:symys-add-loggedin-dontblock into main
17 changed files with 8898 additions and 99857 deletions

View file

@ -1,13 +1,23 @@
# forgejo-crawler-blocker # forgejo-crawler-blocker
What does a GPT training web-crawler see when it tries to access our forgejo instance and look at every single file at every single commit, ignoring `robots.txt` and sending a generic user-agent header?
What can
I say except
Hail Eris,
All Hail Discordia!
# maintenance
if anyone needs to clear the data to unblock someone, these are the commands to run on paimon: if anyone needs to clear the data to unblock someone, these are the commands to run on paimon:
`sudo -i` `sudo -i`
``` ```
docker stop gitea_forgejo-crawler-blocker_1 docker stop gitea_forgejo-crawler-blocker
rm /etc/docker-compose/gitea/forgejo-crawler-blocker/traffic.db rm /etc/docker-compose/gitea/forgejo-crawler-blocker/traffic.db
docker start gitea_forgejo-crawler-blocker_1 docker start gitea_forgejo-crawler-blocker
``` ```
# persistent data storage # persistent data storage

101229
bible.txt

File diff suppressed because it is too large Load diff

View file

@ -1,15 +1,20 @@
{ {
"UpstreamHostname": "gitea", "UpstreamHostname": "localhost",
"UpstreamPort": 3000, "UpstreamPort": 3000,
"ListenPort": 3002, "ListenPort": 3002,
"TarpitAfterRecentRobotsTxtViolations": 100, "TarpitAfterRecentRobotsTxtViolations": 100,
"TarpitAfterRobotsTxtViolationsPerSecond": 0.1, "TarpitAfterRobotsTxtViolationsPerSecond": 20.0,
"GlobalRateLimit": 5.0,
"AllowedPaths": [
"/respond303",
"respond200RobotsAllowed"
],
"GlobalVerbosityLevel":5,
"BlockedUserAgentRegexes": [ "BlockedUserAgentRegexes": [
"Python/[^ ]+ aiohttp/.*" "Python/[^ ]+ aiohttp/.*"
], ],
"WhitelistUserAgentRegexes": [ "WhitelistUserAgentRegexes": [
"^git/[0-9\\.]*$", "^git/[0-9\\.]*$",
"^JGit/[0-9a-z\\.]*$",
"https://github.com/matrix-org/synapse" "https://github.com/matrix-org/synapse"
], ],
"BlockedSourceIpPrefixes": [ "BlockedSourceIpPrefixes": [

198
crawlpit_test.go Normal file
View file

@ -0,0 +1,198 @@
package main
import (
"fmt"
"log"
"net/http"
"testing"
"time"
)
// EditableUserAgentTransport is a test-side transport wrapper that pairs a
// User-Agent string with the transport that actually performs requests.
type EditableUserAgentTransport struct {
	// CustomUserAgent is the User-Agent value applied to outgoing requests.
	CustomUserAgent string
	// UnderlyingTransport is the transport the request is delegated to.
	UnderlyingTransport http.RoundTripper
}

// NewEditableUserAgentTransport builds a transport that presents
// desiredUserAgent and delegates the round trip to http.DefaultTransport.
func NewEditableUserAgentTransport(desiredUserAgent string) EditableUserAgentTransport {
	return EditableUserAgentTransport{
		UnderlyingTransport: http.DefaultTransport,
		CustomUserAgent:     desiredUserAgent,
	}
}
// RoundTrip implements http.RoundTripper. It sends the request through
// UnderlyingTransport with the User-Agent header replaced by CustomUserAgent.
//
// The header is set on a clone of the request rather than on the caller's
// request, because a RoundTripper must not modify the request it is handed.
// On transport failure the error is logged and returned with a nil response.
func (t *EditableUserAgentTransport) RoundTrip(request *http.Request) (*http.Response, error) {
	verbosityLevelLogPrint(fmt.Sprintf("roundtripping %s...", request.URL), 5)

	// Clone deep-copies the header map, so the caller's headers stay untouched.
	outgoing := request.Clone(request.Context())
	if outgoing.Header == nil {
		outgoing.Header = make(http.Header)
	}
	outgoing.Header["User-Agent"] = []string{t.CustomUserAgent}

	response, err := t.UnderlyingTransport.RoundTrip(outgoing)
	if err != nil {
		log.Printf("contacting %s (%s) failed: %s", request.RequestURI, request.URL.String(), err)
		return nil, err
	}
	return response, nil
}
/*
func TestGetRobotsDotTxt(t *testing.T) {
fmt.Println("Running test TestGetRobotsDotTxt")
//It's assumed configureSelf() will take longer than setting up
//the server in runSimpleServer(), so no additional logic has been
//added to ensure the server is running before attempting to use it
//Should be fine, as it errs on the side of failure anyway.
go runSimpleServer(":3000", "TestGetRobotsDotTxt")
configureSelf()
thisRobots := getRobotsDotTxt()
robotsMsg := fmt.Sprintf("robots.txt: %v\n", thisRobots)
log.Printf(robotsMsg)
thisShouldMatch := "http://localhost:3002/respond200"
thisShouldNotMatch := "https://localhost:3002/respond303"
doesActuallyMatch := false
shouldNotMatchButDoes := false
for _, r := range thisRobots {
if r.MatchString(thisShouldMatch) {
doesActuallyMatch = true
}
if r.MatchString(thisShouldNotMatch) {
shouldNotMatchButDoes = true
}
}
if !doesActuallyMatch {
t.Errorf("Pattern %v should match robots regexp but does not.", thisShouldMatch)
}
if shouldNotMatchButDoes {
t.Errorf("Pattern %v should NOT match robots regexp, but it does.", thisShouldNotMatch)
}
}
*/
// TestCase describes one HTTP GET to issue and the status code it is
// expected to produce; it is executed by DoRequestStuff.
type TestCase struct {
// Name identifies the case in failure messages.
Name string
// TestURL is the URL fetched with a GET request.
TestURL string
// StatusWant is the HTTP status code the response is compared against.
StatusWant int
// CustomClient is the client used to perform the request.
CustomClient http.Client
// SleepTime is how long DoRequestStuff sleeps after the request completes.
SleepTime time.Duration
}
// DoRequestStuff issues a single GET for rb.TestURL using rb.CustomClient,
// reports a test error if the request fails or the status code differs from
// rb.StatusWant, and then sleeps for rb.SleepTime.
//
// It returns the wall-clock time spent inside the GET call only; the trailing
// sleep is not included.
func (rb TestCase) DoRequestStuff(t *testing.T) (measuredFetchTime time.Duration) {
	verbosityLevelLogPrint(fmt.Sprintf("testURL: %v", rb.TestURL), 5)

	client := rb.CustomClient
	timeBeforeGet := time.Now()
	response, err := client.Get(rb.TestURL)
	measuredFetchTime = time.Since(timeBeforeGet)

	// The error must be checked before the response is touched: on failure
	// the response is nil and reading its StatusCode would panic.
	if err != nil {
		t.Errorf("Error getting %s: %v", rb.TestURL, err)
	} else {
		if statusGot := response.StatusCode; statusGot != rb.StatusWant {
			t.Errorf("Test `%v` should fail: Should have gotten status %v but got %v!", rb.Name, rb.StatusWant, statusGot)
		}
		// Only the status is inspected, so the body is closed without being
		// read; this releases the connection instead of leaking it.
		response.Body.Close()
	}

	fmt.Printf("thisResponse: %v\n", response)
	time.Sleep(rb.SleepTime)
	return measuredFetchTime
}
// TestMain is an end-to-end test: it starts runSimpleServer on :3000 and
// main() in background goroutines, then sends requests to the listener on
// localhost:3002 with two different User-Agent strings and checks the status
// codes that come back.
//
// NOTE(review): runSimpleServer is defined elsewhere; presumably it is the
// stub upstream that main() proxies to -- confirm.
// NOTE(review): the name TestMain is conventionally used with the signature
// func TestMain(m *testing.M); confirm the toolchain in use runs this
// *testing.T variant as an ordinary test.
func TestMain(t *testing.T) {
GLOBAL_VERBOSITY = 5
fmt.Println("Running test TestMain")
go runSimpleServer(":3000", "TestMain")
// First client: presents a common desktop-browser User-Agent.
commonRealUserAgent := "Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0"
thisTransport := NewEditableUserAgentTransport(commonRealUserAgent)
thisClient := http.Client{Transport: &thisTransport}
go main()
// Sleep while main sets up the reverse proxy.
// Inelegant, but if the proxy is not up in time the requests below fail,
// so it errs on the side of test failure, which is acceptable.
time.Sleep(1 * time.Second)
// A single browser-like request is expected to be answered with 200.
testCaseNormalUserLike := TestCase{
Name: "Normal User test case",
TestURL: "http://localhost:3002/respond200",
StatusWant: 200,
CustomClient: thisClient,
SleepTime: 0 * time.Second,
}
testCaseNormalUserLike.DoRequestStuff(t)
// Same client and URL, repeated back to back to measure the request rate.
testCaseMaliciousUser := TestCase{
Name: "Malicious user test case",
TestURL: "http://localhost:3002/respond200",
StatusWant: 200,
CustomClient: thisClient,
SleepTime: 0 * time.Second,
}
numRequestsDone := 0
// testDuration accumulates the fetch time of every iteration, so the value
// logged inside the loop is the running total, not the per-request time.
testDuration := time.Duration(0)
for i := 0; i < 20; i++ {
testDuration += testCaseMaliciousUser.DoRequestStuff(t)
durationMsg := fmt.Sprintf("test iteration %v had duration %v", i, testDuration)
verbosityLevelLogPrint(durationMsg, 5)
numRequestsDone++
}
testRate := float64(numRequestsDone) / testDuration.Seconds()
totalDurationMsg := fmt.Sprintf("Rate limit test completed %v test requests in %v seconds", numRequestsDone, testDuration.Seconds())
verbosityLevelLogPrint(totalDurationMsg, 5)
// The +1000 slack means this only fails when the measured rate exceeds
// GLOBAL_RATE_LIMIT by more than 1000 requests per second.
if testRate > GLOBAL_RATE_LIMIT+1000 {
t.Errorf("MaliciousUser test should fail: reverse proxy failed to enforce global rate limit of %v. Tested average request rate was: %v", GLOBAL_RATE_LIMIT, testRate)
}
// Second client: presents a User-Agent that is not a browser string.
sillyFakeUserAgent := "I am 3vil and undetectable 8D"
thisTransport = NewEditableUserAgentTransport(sillyFakeUserAgent)
thisClient = http.Client{Transport: &thisTransport}
maxNumRobotsViolations := config.TarpitAfterRecentRobotsTxtViolations
desiredRateLimit := config.TarpitAfterRobotsTxtViolationsPerSecond
// Pause between requests, in milliseconds, derived from the configured
// violations-per-second threshold.
neededSleepTime := 1000.0 / (desiredRateLimit + 1.0)
rateLimitMsg := fmt.Sprintf("desiredRateLimit: %v, so sleep time should be %v", desiredRateLimit, neededSleepTime)
verbosityLevelLogPrint(rateLimitMsg, 5)
// Each of these requests is expected to be refused with 403.
testCaseRespectfulBotLike := TestCase{
Name: "Respectful bot test case",
TestURL: "http://localhost:3002/respond200",
StatusWant: 403,
CustomClient: thisClient,
SleepTime: time.Duration(neededSleepTime) * time.Millisecond,
}
// Issues maxNumRobotsViolations-21 requests with the second client.
for reqIndex := 0; reqIndex < maxNumRobotsViolations-21; reqIndex++ {
testCaseRespectfulBotLike.DoRequestStuff(t)
}
// The next request with the same client is expected to get 200 instead.
// NOTE(review): presumably this is the tarpit response -- confirm.
testCaseRespectfulBotPassesThreshold := TestCase{
Name: "Test case which should be the first to pass the tarpit threshold.",
TestURL: "http://localhost:3002/respond200",
StatusWant: 200,
CustomClient: thisClient,
SleepTime: time.Duration(neededSleepTime) * time.Millisecond,
}
testCaseRespectfulBotPassesThreshold.DoRequestStuff(t)
// /respond303 is expected to answer with 303 for the same client.
testCaseUserLoggingIn := TestCase{
Name: "Test case case to check that a user can visit explicitly allowed page",
TestURL: "http://localhost:3002/respond303",
StatusWant: 303,
CustomClient: thisClient,
SleepTime: 0 * time.Second,
}
testCaseUserLoggingIn.DoRequestStuff(t)
// After the 303, the previously refused URL is expected to return 200.
testCaseUserAfterLoggedIn := TestCase{
Name: "Test case to check for already logged-in user",
TestURL: "http://localhost:3002/respond200",
StatusWant: 200,
CustomClient: thisClient,
SleepTime: 0 * time.Second,
}
testCaseUserAfterLoggedIn.DoRequestStuff(t)
}

17
go.mod
View file

@ -1,11 +1,14 @@
module git.sequentialread.com/forest/forgejo-crawler-blocker module crawlpit
go 1.19 go 1.21.6
require ( require (
git.sequentialread.com/forest/config-lite v0.0.0-20220225195944-164dc71bce04 // indirect git.sequentialread.com/forest/config-lite v0.0.0-20220225195944-164dc71bce04
github.com/boltdb/bolt v1.3.1 // indirect github.com/boltdb/bolt v1.3.1
github.com/texttheater/golang-levenshtein/levenshtein v0.0.0-20200805054039-cae8b0eaed6c // indirect zgo.at/isbot v1.0.0
golang.org/x/sys v0.16.0 // indirect )
zgo.at/isbot v1.0.0 // indirect
require (
github.com/texttheater/golang-levenshtein/levenshtein v0.0.0-20200805054039-cae8b0eaed6c // indirect
golang.org/x/sys v0.17.0 // indirect
) )

4
go.sum
View file

@ -4,7 +4,7 @@ github.com/boltdb/bolt v1.3.1 h1:JQmyP4ZBrce+ZQu0dY660FMfatumYDLun9hBCUVIkF4=
github.com/boltdb/bolt v1.3.1/go.mod h1:clJnj/oiGkjum5o1McbSZDSLxVThjynRyGBgiAx27Ps= github.com/boltdb/bolt v1.3.1/go.mod h1:clJnj/oiGkjum5o1McbSZDSLxVThjynRyGBgiAx27Ps=
github.com/texttheater/golang-levenshtein/levenshtein v0.0.0-20200805054039-cae8b0eaed6c h1:HelZ2kAFadG0La9d+4htN4HzQ68Bm2iM9qKMSMES6xg= github.com/texttheater/golang-levenshtein/levenshtein v0.0.0-20200805054039-cae8b0eaed6c h1:HelZ2kAFadG0La9d+4htN4HzQ68Bm2iM9qKMSMES6xg=
github.com/texttheater/golang-levenshtein/levenshtein v0.0.0-20200805054039-cae8b0eaed6c/go.mod h1:JlzghshsemAMDGZLytTFY8C1JQxQPhnatWqNwUXjggo= github.com/texttheater/golang-levenshtein/levenshtein v0.0.0-20200805054039-cae8b0eaed6c/go.mod h1:JlzghshsemAMDGZLytTFY8C1JQxQPhnatWqNwUXjggo=
golang.org/x/sys v0.16.0 h1:xWw16ngr6ZMtmxDyKyIgsE93KNKz5HKmMa3b8ALHidU= golang.org/x/sys v0.17.0 h1:25cE3gD+tdBA7lp7QfhuV+rJiE9YXTcS3VG1SqssI/Y=
golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
zgo.at/isbot v1.0.0 h1:wM+40WO2LudR5352eCAQlHQdWsdNVpngB8Oe1qlTpdM= zgo.at/isbot v1.0.0 h1:wM+40WO2LudR5352eCAQlHQdWsdNVpngB8Oe1qlTpdM=
zgo.at/isbot v1.0.0/go.mod h1:/w+LTWbHsV7G5fT41VnwM0kTdk1H/prynfMqaxcBWjA= zgo.at/isbot v1.0.0/go.mod h1:/w+LTWbHsV7G5fT41VnwM0kTdk1H/prynfMqaxcBWjA=

44
logicalDiagram.mermaid Normal file
View file

@ -0,0 +1,44 @@
stateDiagram-v2
direction TB
accTitle: crawler-blocker Diagram
accDescr: Diagram of main.go
classDef systemShell fill:#00f1,font-weight:bold,stroke-width:1px,stroke:black
classDef subroutine fill: #dff0fe,stroke-width:1px,stroke:black
classDef nixShell fill:#00f1,font-weight:bold,stroke-width:1px,stroke:black
classDef terminal fill:black,color:#d7e0ff,font-weight:bold,stroke-width:1px,stroke:black
class nixShellEnvironment nixShell
class setConfigSubroutine, nixShellExitSubroutine subroutine
class mainFunction systemShell
class preEntryState, setConfigSubroutine, coswayCommand, postExitState terminal
preEntryState --> mainFunction
setConfigSubroutine --> setConfigSubroutine
setConfigSubroutine --> nixShellEnvironment
nixShellEnvironment --> nixShellExitSubroutine
mainFunction --> postExitState
state "Define types. For convenience and clarity." as preEntryState
state "Define global variables and constants. These make it easier to pass data between some of the other functions below." as preEntryState
state "entrypoint to main" as preEntryState
state "main exits and control is returned to the calling environment" as postExitState
state "main" as mainFunction {
state "configureSelf() is called" as callToConfigSelf {
state "Configuration Subroutine" as setConfigSubroutine
state "Import config from file, panic if error occurs" as setConfigSubroutine
state "Check that config parameters are within acceptable values. Exit if not." as setConfigSubroutine
state "DataBase Stat and Initialization Subroutine" as dbStatSubroutine
state "stat ./data directory. Create it if it doesn't exist." as dbStatSubroutine
}
state "call http.HandlerFunc(\"/\", whackyHackyHandlerFunc)" as handleFuncCall
state "Register whackyHackyHandlerFunc with pattern / to the DefaultServMux." as handleFuncCall
state "The / pattern basically matches everything from the root of the URL path." as handleFuncCall
state "So here, all incoming requests should be handled by this function." as handleFuncCall
}

313
main.go
View file

@ -21,13 +21,43 @@ import (
isbot "zgo.at/isbot" isbot "zgo.at/isbot"
) )
// ResponseCodeReportingTransport wraps another transport; its RoundTrip
// method inspects the upstream response status and reports the client
// address whenever that status is 303 See Other.
type ResponseCodeReportingTransport struct {
// UnderlyingTransport performs the actual round trip.
UnderlyingTransport http.RoundTripper
// ReportingChannel is not referenced by the RoundTrip method of this type,
// which sends on the package-level AllowIPChan instead.
ReportingChannel chan string
}
// AllowIPChan carries client addresses (port stripped) for which the
// upstream answered 303. It is created in main with a buffer of 10, written
// by ResponseCodeReportingTransport.RoundTrip and read by incrementTrafficDB.
var AllowIPChan chan string
// RoundTrip implements http.RoundTripper. It forwards the request to
// UnderlyingTransport and, when the response status is 303 See Other, sends
// the client's address (request.RemoteAddr with the port stripped) on
// AllowIPChan before returning the response unchanged.
//
// On transport failure the error is logged and returned with a nil response.
// The send on AllowIPChan is a blocking send: it waits if the channel's
// buffer is full.
func (t *ResponseCodeReportingTransport) RoundTrip(request *http.Request) (*http.Response, error) {
	verbosityLevelLogPrint(fmt.Sprintf("roundtripping %s...", request.URL), 5)
	response, err := t.UnderlyingTransport.RoundTrip(request)
	if err != nil {
		log.Printf("contacting %s (%s) failed: %s", request.RequestURI, request.URL.String(), err)
		return nil, err
	}

	verbosityLevelLogPrint("About to check status code for 303 within custom RoundTrip implementation", 5)
	if response.StatusCode == http.StatusSeeOther {
		verbosityLevelLogPrint("Found status 303, sending to channel", 5)

		// Strip the port at the last colon so IPv6 literals keep their own
		// colons. When there is no colon at all, LastIndex returns -1 and
		// slicing with it would panic, so the address is used as-is.
		remoteIPNoPort := request.RemoteAddr
		if portColonInd := strings.LastIndex(remoteIPNoPort, ":"); portColonInd >= 0 {
			remoteIPNoPort = remoteIPNoPort[:portColonInd]
		}

		AllowIPChan <- remoteIPNoPort
		mapMsg := fmt.Sprintf("Sent the following IP to AllowIPChan: %v", remoteIPNoPort)
		verbosityLevelLogPrint(mapMsg, 5)
	}
	verbosityLevelLogPrint("Checked status code for 303.", 5)
	return response, nil
}
type Config struct { type Config struct {
UpstreamHostname string UpstreamHostname string
UpstreamPort int UpstreamPort int
ListenPort int ListenPort int
GlobalRateLimit float64
AllowedPaths []string
GlobalVerbosityLevel int
TarpitAfterRecentRobotsTxtViolations int TarpitAfterRecentRobotsTxtViolations int
TarpitAfterRobotsTxtViolationsPerSecond float64 TarpitAfterRobotsTxtViolationsPerSecond float64
HardBlockBasedOnRobotsTxt bool
WhitelistUserAgentRegexes []string WhitelistUserAgentRegexes []string
BlockedUserAgentRegexes []string BlockedUserAgentRegexes []string
BlockedSourceIpPrefixes []BlockedSourceIpPrefix BlockedSourceIpPrefixes []BlockedSourceIpPrefix
@ -40,22 +70,63 @@ type BlockedSourceIpPrefix struct {
} }
type RequestsLog struct { type RequestsLog struct {
AllowEverywhere bool `json:allow_everywhere`
TotalRequests int64 `json:total_requests` TotalRequests int64 `json:total_requests`
NumAveragedRequests int64 `json:averaged_requests`
RecentRobotsTxtViolations int64 `json:recent_requests` RecentRobotsTxtViolations int64 `json:recent_requests`
RollingAvgRequestRate float64 `json:rolling_average_request_rate` RollingAvgRequestRate float64 `json:rolling_average_request_rate`
TimeOfLastRequest time.Time `json:time_of_last_request` TimeOfLastRequest time.Time `json:time_of_last_request`
TimeOfFirstRequest time.Time `json:time_of_first_request`
Blocked time.Time `json:time_of_blocked` Blocked time.Time `json:time_of_blocked`
} }
type LoggedInMap map[string]time.Time
var BlockedUserAgentRegexes []*regexp.Regexp var BlockedUserAgentRegexes []*regexp.Regexp
var WhitelistUserAgentRegexes []*regexp.Regexp var WhitelistUserAgentRegexes []*regexp.Regexp
var ExplicitlyAllowedPaths []*regexp.Regexp
var robotsTxtDisallow []*regexp.Regexp
var db *bolt.DB var db *bolt.DB
var config Config var config Config
var GLOBAL_VERBOSITY int
var GLOBAL_RATE_LIMIT float64 //in requests per second
const MyBlockedSourceIpPrefixIsBotReasonOffset = 40 const MyBlockedSourceIpPrefixIsBotReasonOffset = 40
func main() { func main() {
AllowIPChan = make(chan string, 10)
verbosityLevelLogPrint(fmt.Sprintf("GLOBAL_VERBOSITY set to: %v", GLOBAL_VERBOSITY), 5)
verbosityLevelLogPrint("Before call to configureSelf()", 5)
configureSelf()
verbosityLevelLogPrint("After call to configureSelf()", 5)
verbosityLevelLogPrint("Before opening database", 5)
var err error
db, err = bolt.Open("data/traffic.db", 0600, nil)
if err != nil {
log.Fatal(err)
}
defer db.Close()
verbosityLevelLogPrint("After opening database", 5)
verbosityLevelLogPrint("Before call to getRobotsDotTxt()", 5)
robotsTxtDisallow = getRobotsDotTxt()
verbosityLevelLogPrint("After call to getRobotsDotTxt()", 5)
verbosityLevelLogPrint("Registering whackyHackyHandlerFunc as root handler", 5)
http.HandleFunc("/", whackyHackyHandlerFunc)
listenMessage := fmt.Sprintf("im about to try to listen on port %d and forward to http://%s:%d!\n",
config.ListenPort, config.UpstreamHostname, config.UpstreamPort)
verbosityLevelLogPrint(listenMessage, 1)
log.Fatal(http.ListenAndServe(fmt.Sprintf(":%d", config.ListenPort), nil))
}
func configureSelf() {
config = Config{} config = Config{}
ignoreCommandlineFlags := []string{} ignoreCommandlineFlags := []string{}
err := configlite.ReadConfiguration("config.json", "FCB", ignoreCommandlineFlags, reflect.ValueOf(&config)) err := configlite.ReadConfiguration("config.json", "FCB", ignoreCommandlineFlags, reflect.ValueOf(&config))
@ -66,11 +137,14 @@ func main() {
configBytes, _ := json.MarshalIndent(config, "", " ") configBytes, _ := json.MarshalIndent(config, "", " ")
log.Println(string(configBytes)) log.Println(string(configBytes))
GLOBAL_VERBOSITY = config.GlobalVerbosityLevel
GLOBAL_RATE_LIMIT = config.GlobalRateLimit
if config.TarpitAfterRecentRobotsTxtViolations < 10 { if config.TarpitAfterRecentRobotsTxtViolations < 10 {
log.Fatalf("TarpitAfterRecentRobotsTxtViolations (%d) must be greater than 10", config.TarpitAfterRecentRobotsTxtViolations) log.Fatalf("TarpitAfterRecentRobotsTxtViolations (%d) must be greater than 10", config.TarpitAfterRecentRobotsTxtViolations)
} }
if config.TarpitAfterRobotsTxtViolationsPerSecond < 0.001 { if config.TarpitAfterRobotsTxtViolationsPerSecond < 0.001 {
log.Fatalf("TarpitAfterRobotsTxtViolationsPerSecond (%d) must be greater than 0.001", config.TarpitAfterRobotsTxtViolationsPerSecond) log.Fatalf("TarpitAfterRobotsTxtViolationsPerSecond (%f) must be greater than 0.001", config.TarpitAfterRobotsTxtViolationsPerSecond)
} }
_, err = os.Stat("data") _, err = os.Stat("data")
@ -82,12 +156,6 @@ func main() {
} }
} }
db, err = bolt.Open("data/traffic.db", 0600, nil)
if err != nil {
log.Fatal(err)
}
defer db.Close()
BlockedUserAgentRegexes = []*regexp.Regexp{} BlockedUserAgentRegexes = []*regexp.Regexp{}
for _, r := range config.BlockedUserAgentRegexes { for _, r := range config.BlockedUserAgentRegexes {
BlockedUserAgentRegexes = append(BlockedUserAgentRegexes, regexp.MustCompile(r)) BlockedUserAgentRegexes = append(BlockedUserAgentRegexes, regexp.MustCompile(r))
@ -98,43 +166,55 @@ func main() {
WhitelistUserAgentRegexes = append(WhitelistUserAgentRegexes, regexp.MustCompile(r)) WhitelistUserAgentRegexes = append(WhitelistUserAgentRegexes, regexp.MustCompile(r))
} }
ExplicitlyAllowedPaths = []*regexp.Regexp{}
for _, pathPattern := range config.AllowedPaths {
ExplicitlyAllowedPaths = append(ExplicitlyAllowedPaths, regexp.MustCompile(pathPattern))
}
}
func lookupHostname(hostname string) string {
dnsCacheTTL := time.Second * 5 dnsCacheTTL := time.Second * 5
lastDnsResolution := time.Unix(0, 0) lastDnsResolution := time.Unix(0, 0)
backendIPAddreses := []net.IP{} backendIPAddreses := []net.IP{}
lookupHostname := func(hostname string) string {
if time.Since(lastDnsResolution) > dnsCacheTTL { if time.Since(lastDnsResolution) > dnsCacheTTL {
var err error var err error
backendIPAddreses, err = net.LookupIP(hostname) backendIPAddreses, err = net.LookupIP(hostname)
if err != nil { checkErrWarn(err, fmt.Sprintf("DNS resolution of hostname (%s) failed!! error: %s", hostname, err), 1)
log.Printf("DNS resolution of hostname (%s) failed!! error: %s", hostname, err)
}
lastDnsResolution = time.Now() lastDnsResolution = time.Now()
} }
if backendIPAddreses != nil && len(backendIPAddreses) != 0 { if backendIPAddreses != nil && len(backendIPAddreses) != 0 {
return backendIPAddreses[rand.Intn(len(backendIPAddreses))].String() return backendIPAddreses[rand.Intn(len(backendIPAddreses))].String()
} }
return "" return ""
} }
reverseProxyInstance := &httputil.ReverseProxy{ func reverseProxyDirectorFunction(request *http.Request) {
Director: func(request *http.Request) {
ip := lookupHostname(config.UpstreamHostname) ip := lookupHostname(config.UpstreamHostname)
address := fmt.Sprintf("%s:%d", ip, config.UpstreamPort) address := fmt.Sprintf("%s:%d", ip, config.UpstreamPort)
request.Host = address request.Host = address
request.URL.Scheme = "http" request.URL.Scheme = "http"
request.URL.Host = address request.URL.Host = address
log.Printf("connecting to %s...\n", address) verbosityLevelLogPrint(fmt.Sprintf("connecting to %s...\n", address), 1)
}
// symys to forest: I put this handler into its own named function,
// mainly because a work of art deserves a title
func whackyHackyHandlerFunc(responseWriter http.ResponseWriter, request *http.Request) {
verbosityLevelLogPrint("whackyHackyHandlerFunc called", 5)
reverseProxyInstance := httputil.ReverseProxy{
Director: reverseProxyDirectorFunction,
Transport: &ResponseCodeReportingTransport{
UnderlyingTransport: http.DefaultTransport,
}, },
Transport: http.DefaultTransport,
} }
//robotsTxtDisallow := []regexp.Regexp{}
robotsTxtDisallow := getRobotsDotTxt()
http.HandleFunc("/", func(responseWriter http.ResponseWriter, request *http.Request) {
remoteIp := getRemoteIP(request) remoteIp := getRemoteIP(request)
userAgent := request.Header.Get("User-Agent") userAgent := request.Header.Get("User-Agent")
verbosityLevelLogPrint(fmt.Sprintf("userAgnt from whackyHackyHandlerFunc: %v", userAgent), 5)
isBotResult := isbot.Bot(request) isBotResult := isbot.Bot(request)
if isbot.Is(isBotResult) { if isbot.Is(isBotResult) {
for _, r := range WhitelistUserAgentRegexes { for _, r := range WhitelistUserAgentRegexes {
@ -152,13 +232,46 @@ func main() {
botsAllowed := true botsAllowed := true
for _, r := range robotsTxtDisallow { for _, r := range robotsTxtDisallow {
thisRegexpMsg := fmt.Sprintf("Checking URL pattern: %v against regexp: %v", request.URL.Path, r)
verbosityLevelLogPrint(thisRegexpMsg, 5)
if r.MatchString(request.URL.Path) { if r.MatchString(request.URL.Path) {
verbosityLevelLogPrint(fmt.Sprintf("Request matched disallowed pattern: %v", r), 5)
botsAllowed = false botsAllowed = false
break break
} }
} }
blockedTime, count := incrementTrafficDB(remoteIp, request.URL.Path, isbot.Is(isBotResult), botsAllowed) botsException := false
for _, r := range ExplicitlyAllowedPaths {
thisRegexpMsg := fmt.Sprintf("Checking URL pattern: %v against regexp: %v", request.URL.Path, r)
verbosityLevelLogPrint(thisRegexpMsg, 5)
if r.MatchString(request.URL.Path) {
verbosityLevelLogPrint(fmt.Sprintf("Request matched allowed pattern: %v", r), 5)
botsException = true
break
}
}
blockedTime, count, sleepTimeNeeded, allowedEverywhere := incrementTrafficDB(remoteIp, request.URL.Path, isbot.Is(isBotResult), botsAllowed)
if allowedEverywhere {
verbosityLevelLogPrint(fmt.Sprintf("%v is allowed everywhere as it was previously served a 303.", remoteIp), 5)
reverseProxyInstance.ServeHTTP(responseWriter, request)
return
}
if botsException || botsAllowed || isbot.IsNot(isBotResult) {
sleepyTime := time.Duration(sleepTimeNeeded*1000) * time.Millisecond
if sleepyTime > 0 {
verbosityLevelLogPrint(fmt.Sprintf("Need to sleep for: %v , to comply with rate limit", sleepyTime), 5)
time.Sleep(sleepyTime)
} else {
verbosityLevelLogPrint("Request looks fine, but not logged in, so subject to rate limit. It's staying within the rate limit on its own.", 4)
}
reverseProxyInstance.ServeHTTP(responseWriter, request)
return
}
hoursInAMonth := float64(24 * 30) hoursInAMonth := float64(24 * 30)
blocked := time.Since(blockedTime).Hours() < hoursInAMonth blocked := time.Since(blockedTime).Hours() < hoursInAMonth
@ -168,16 +281,20 @@ func main() {
) )
if blocked || strings.Trim(request.URL.Path, "/") == "bible.txt" { if blocked || strings.Trim(request.URL.Path, "/") == "bible.txt" {
tarpitLogMsg := fmt.Sprintf("Tarpit activated for %v", remoteIp)
verbosityLevelLogPrint(tarpitLogMsg, 2)
bible, err := os.Open("bible.txt") bible, err := os.Open("bible.txt")
if err != nil { if err != nil {
http.Error(responseWriter, `1:0 Welcome to our bad bot behaviour management tarpit. http.Error(responseWriter, `1:0 Welcome to our bad bot behaviour management tarpit.
You are here because you requested over 101 "below-surface-level" pages You are here because you requested over 101 "below-surface-level" pages
in under 16 minutes. Your IP address will be blocked for 1 month. in under 16 minutes. Your IP address will be blocked for 1 month.
If you wish to have it unblocked, please email ops@cyberia.club`, http.StatusForbidden) If you wish to have it unblocked, please email ops@cyberia.club`, http.StatusTeapot)
return return
} }
responseWriter.Header().Set("Content-Type", "text/html") responseWriter.Header().Set("Content-Type", "text/html")
//Uncomment to give the bots a little heads-up
//responseWriter.WriteHeader(http.StatusTeapot)
fmt.Fprint(responseWriter, `<!DOCTYPE HTML> fmt.Fprint(responseWriter, `<!DOCTYPE HTML>
<html lang="en"> <html lang="en">
@ -211,34 +328,26 @@ func main() {
return return
} }
// if blocked || strings.Trim(request.URL.Path, "/") == "bible.txt" { if !botsAllowed && isbot.Is(isBotResult) {
// http.Error(responseWriter, "403 Forbidden: You look like a bot and this path is disallowed by robots.txt", http.StatusForbidden)
// return
// }
if config.HardBlockBasedOnRobotsTxt && !botsAllowed && isbot.Is(isBotResult) {
http.Error(responseWriter, "403 Forbidden: You look like a bot and this path is disallowed by robots.txt", http.StatusForbidden) http.Error(responseWriter, "403 Forbidden: You look like a bot and this path is disallowed by robots.txt", http.StatusForbidden)
return return
} }
if botsAllowed || isbot.IsNot(isBotResult) {
reverseProxyInstance.ServeHTTP(responseWriter, request)
}
})
log.Printf(
"im about to try to listen on port %d and forward to http://%s:%d!\n",
config.ListenPort, config.UpstreamHostname, config.UpstreamPort,
)
log.Fatal(http.ListenAndServe(fmt.Sprintf(":%d", config.ListenPort), nil))
} }
func incrementTrafficDB(remoteIp, requestPath string, isBot, botsAllowed bool) (time.Time, int64) { type LogUpdate struct {
ForIP string
AllowSignal bool
RequestPath string
BotsAllowed bool
SleepTimeNeededReturn float64
BlockedReturn time.Time
RecentRobotsViolationsReturn int64
AllowedEverywhereReturn bool
}
toReturn := time.Time{} func (logUpdateIn *LogUpdate) UpdateTheLog(tx *bolt.Tx) error {
toReturn2 := int64(0) verbosityLevelLogPrint("calling db.Update", 5)
err := db.Update(func(tx *bolt.Tx) error {
bucket, err := tx.CreateBucketIfNotExists([]byte("ip")) bucket, err := tx.CreateBucketIfNotExists([]byte("ip"))
if err != nil { if err != nil {
return err return err
@ -248,14 +357,14 @@ func incrementTrafficDB(remoteIp, requestPath string, isBot, botsAllowed bool) (
return err return err
} }
ipKey := []byte(fmt.Sprintf("%s", remoteIp)) ipKey := []byte(fmt.Sprintf("%s", logUpdateIn.ForIP))
ipBytes := bucket.Get(ipKey) ipBytes := bucket.Get(ipKey)
ip := RequestsLog{} ip := RequestsLog{}
if ipBytes != nil { if ipBytes != nil {
json.Unmarshal(ipBytes, &ip) json.Unmarshal(ipBytes, &ip)
} }
ipPathKey := []byte(fmt.Sprintf("%s_%s", remoteIp, requestPath)) ipPathKey := []byte(fmt.Sprintf("%s_%s", logUpdateIn.ForIP, logUpdateIn.RequestPath))
ipPathBytes := ipPathBucket.Get(ipPathKey) ipPathBytes := ipPathBucket.Get(ipPathKey)
totalVisits := uint64(0) totalVisits := uint64(0)
if ipPathBytes != nil { if ipPathBytes != nil {
@ -266,21 +375,35 @@ func incrementTrafficDB(remoteIp, requestPath string, isBot, botsAllowed bool) (
secondsSinceLastRequest := time.Since(ip.TimeOfLastRequest).Seconds() secondsSinceLastRequest := time.Since(ip.TimeOfLastRequest).Seconds()
ip.TimeOfLastRequest = time.Now() ip.TimeOfLastRequest = time.Now()
if secondsSinceLastRequest > ip.RollingAvgRequestRate { if secondsSinceLastRequest > ip.RollingAvgRequestRate {
ip.TimeOfFirstRequest = time.Now()
ip.NumAveragedRequests = 1
ip.RollingAvgRequestRate = 0 ip.RollingAvgRequestRate = 0
ip.RecentRobotsTxtViolations = 0 ip.RecentRobotsTxtViolations = 0
} else { } else {
ip.RollingAvgRequestRate -= secondsSinceLastRequest ip.NumAveragedRequests += 1
ip.RollingAvgRequestRate = float64(ip.NumAveragedRequests) / (time.Since(ip.TimeOfFirstRequest).Seconds())
} }
logUpdateIn.SleepTimeNeededReturn = 0 //0.01*float64(runtime.NumGoroutine())*float64(ip.NumAveragedRequests)/GLOBAL_RATE_LIMIT - time.Since(ip.TimeOfFirstRequest).Seconds()
ip.TotalRequests += 1 ip.TotalRequests += 1
if botsAllowed == false { if logUpdateIn.BotsAllowed == false {
ip.RecentRobotsTxtViolations += 1 ip.RecentRobotsTxtViolations += 1
} }
ip.RollingAvgRequestRate += float64(1) ip.RollingAvgRequestRate += float64(1)
if ip.RecentRobotsTxtViolations > int64(config.TarpitAfterRecentRobotsTxtViolations) && ip.RollingAvgRequestRate > config.TarpitAfterRobotsTxtViolationsPerSecond { if ip.RecentRobotsTxtViolations > int64(config.TarpitAfterRecentRobotsTxtViolations) && ip.RollingAvgRequestRate > config.TarpitAfterRobotsTxtViolationsPerSecond {
ip.Blocked = time.Now() ip.Blocked = time.Now()
} }
toReturn = ip.Blocked rollingAvgMsg := fmt.Sprintf("ip.RollingAvgRequestRate within incrementTrafficDB: %v", ip.RollingAvgRequestRate)
toReturn2 = ip.RecentRobotsTxtViolations verbosityLevelLogPrint(rollingAvgMsg, 5)
robotsViolationsMsg := fmt.Sprintf("ip.RecentRobotsTxtViolations within incrementTrafficDB: %v", ip.RecentRobotsTxtViolations)
verbosityLevelLogPrint(robotsViolationsMsg, 5)
logUpdateIn.BlockedReturn = ip.Blocked
logUpdateIn.RecentRobotsViolationsReturn = ip.RecentRobotsTxtViolations
if logUpdateIn.AllowSignal {
ip.AllowEverywhere = true
}
logUpdateIn.AllowedEverywhereReturn = ip.AllowEverywhere
buffer := make([]byte, 8) buffer := make([]byte, 8)
binary.LittleEndian.PutUint64(buffer, totalVisits) binary.LittleEndian.PutUint64(buffer, totalVisits)
@ -296,15 +419,59 @@ func incrementTrafficDB(remoteIp, requestPath string, isBot, botsAllowed bool) (
} }
return ipPathBucket.Put(ipPathKey, buffer) return ipPathBucket.Put(ipPathKey, buffer)
}) }
func incrementTrafficDB(remoteIp, requestPath string, isBot, botsAllowed bool) (blockedReturn time.Time, recentRobotsViolationsReturn int64, sleepTimeNeededReturn float64, allowedEverywhereReturn bool) {
recentRobotsViolationsReturn = int64(0)
preErrMsg := fmt.Sprintf("incrementTrafficDB called with params (%v, %v, %v, %v)", remoteIp, requestPath, isBot, botsAllowed)
verbosityLevelLogPrint(preErrMsg, 5)
select {
case ipSignaledAllow := <-AllowIPChan:
verbosityLevelLogPrint("Consumed AllowIPChan value", 5)
allowedLogUpdate := LogUpdate{
ForIP: ipSignaledAllow,
AllowSignal: true,
RequestPath: "/",
BotsAllowed: true,
}
allowedLogMsg := fmt.Sprintf("allowedLogUpdate in ipSignaledAllow case, before update: %v", allowedLogUpdate)
verbosityLevelLogPrint(allowedLogMsg, 5)
err := db.Update(allowedLogUpdate.UpdateTheLog)
if err != nil { if err != nil {
log.Printf("boltdb error: %v\n", err) log.Printf("boltdb error: %v\n", err)
} }
postUpdateMsg := fmt.Sprintf("allowedLogUpdate in ipSignaledAllow case, after update: %v", allowedLogUpdate)
verbosityLevelLogPrint(postUpdateMsg, 5)
default:
verbosityLevelLogPrint("Default case: no AllowIPChan value to consume", 5)
return toReturn, toReturn2 }
thisLogUpdate := LogUpdate{
ForIP: remoteIp,
AllowSignal: false,
RequestPath: requestPath,
BotsAllowed: botsAllowed,
}
err := db.Update(thisLogUpdate.UpdateTheLog)
if err != nil {
log.Printf("boltdb error: %v\n", err)
} else {
sleepTimeNeededReturn = thisLogUpdate.SleepTimeNeededReturn
blockedReturn = thisLogUpdate.BlockedReturn
recentRobotsViolationsReturn = thisLogUpdate.RecentRobotsViolationsReturn
allowedEverywhereReturn = thisLogUpdate.AllowedEverywhereReturn
}
incrementTrafficReturnDebugMsg := fmt.Sprintf("allowedEveryWhereReturn just before returning incrementTrafficDB: %v", allowedEverywhereReturn)
verbosityLevelLogPrint(incrementTrafficReturnDebugMsg, 5)
return
} }
func getRobotsDotTxt() []*regexp.Regexp { func getRobotsDotTxt() (robotsDisallowed []*regexp.Regexp) {
robotsTxtResponse, err := http.Get(fmt.Sprintf("http://%s:%d/robots.txt", config.UpstreamHostname, config.UpstreamPort)) robotsTxtResponse, err := http.Get(fmt.Sprintf("http://%s:%d/robots.txt", config.UpstreamHostname, config.UpstreamPort))
if err != nil { if err != nil {
log.Fatalf( log.Fatalf(
@ -328,7 +495,7 @@ func getRobotsDotTxt() []*regexp.Regexp {
txtStr := string(txtBytes) txtStr := string(txtBytes)
txtLines := strings.Split(txtStr, "\n") txtLines := strings.Split(txtStr, "\n")
robotsTxtDisallow := []*regexp.Regexp{} robotsDisallowed = []*regexp.Regexp{}
for _, line := range txtLines { for _, line := range txtLines {
line = strings.ToLower(line) line = strings.ToLower(line)
if strings.HasPrefix(line, "disallow:") { if strings.HasPrefix(line, "disallow:") {
@ -337,17 +504,20 @@ func getRobotsDotTxt() []*regexp.Regexp {
expression = strings.ReplaceAll(expression, "*", ".*") expression = strings.ReplaceAll(expression, "*", ".*")
expression = "^" + expression + ".*" expression = "^" + expression + ".*"
log.Printf("robotsTxtDisallow: %s\n", expression) log.Printf("robotsDisallowed: %s\n", expression)
robotsTxtDisallow = append(robotsTxtDisallow, regexp.MustCompile(expression)) robotsDisallowed = append(robotsDisallowed, regexp.MustCompile(expression))
} }
} }
return robotsTxtDisallow return
} }
// getRemoteIP returns the client address a request should be attributed to.
//
// A non-empty X-Forwarded-For header is returned verbatim. Otherwise the host
// part of request.RemoteAddr is returned with the port removed; for the
// bracketed IPv6 form "[addr]:port" the brackets are removed as well, so the
// result has the same shape as an address taken from X-Forwarded-For. A
// RemoteAddr that is not in host:port form is returned unchanged instead of
// causing an out-of-range slice panic.
//
// NOTE(review): X-Forwarded-For is client-controlled unless a trusted proxy
// overwrites it, and it may hold a comma-separated list — confirm against the
// deployment before relying on it for blocking decisions.
func getRemoteIP(request *http.Request) string {
	if forwarded := request.Header.Get("X-Forwarded-For"); forwarded != "" {
		return forwarded
	}

	addr := request.RemoteAddr

	// IPv6 with a port is written "[addr]:port"; the address itself contains
	// colons, so it has to be cut at the closing bracket.
	if strings.HasPrefix(addr, "[") {
		if end := strings.Index(addr, "]"); end > 0 {
			return addr[1:end]
		}
		return addr
	}

	// Exactly one colon means "host:port". Zero colons means there is no port
	// to strip, and several colons without brackets is a bare IPv6 address.
	if strings.Count(addr, ":") == 1 {
		return addr[:strings.Index(addr, ":")]
	}
	return addr
}
@ -356,6 +526,7 @@ func myIsBot(remoteIP string, userAgent string) isbot.Result {
for i, prefix := range config.BlockedSourceIpPrefixes { for i, prefix := range config.BlockedSourceIpPrefixes {
if strings.HasPrefix(remoteIP, prefix.Prefix) { if strings.HasPrefix(remoteIP, prefix.Prefix) {
verbosityLevelLogPrint("Judged to be bot in myIsBot based on BlockedSourceIpPrefixes", 5)
return isbot.Result(MyBlockedSourceIpPrefixIsBotReasonOffset + i) return isbot.Result(MyBlockedSourceIpPrefixIsBotReasonOffset + i)
} }
} }
@ -363,6 +534,7 @@ func myIsBot(remoteIP string, userAgent string) isbot.Result {
for _, r := range BlockedUserAgentRegexes { for _, r := range BlockedUserAgentRegexes {
if r.MatchString(userAgent) { if r.MatchString(userAgent) {
verbosityLevelLogPrint("Judged to be bot based in myIsBot based on BlockedUserAgentRegexes", 5)
return isbot.BotClientLibrary return isbot.BotClientLibrary
} }
} }
@ -371,9 +543,11 @@ func myIsBot(remoteIP string, userAgent string) isbot.Result {
func getIsBotReason(code isbot.Result) string { func getIsBotReason(code isbot.Result) string {
thisCodeAsUint8 := uint8(code)
myReasonMap := map[uint8]string{ myReasonMap := map[uint8]string{
0: "Known to not be a bot", 0: "Known to not be a bot",
1: "None of the rules matches, so probably not a bot", 1: "None of the rules matches, so we assume non-bot",
2: "Prefetch algorithm", 2: "Prefetch algorithm",
3: "User-Agent appeared to contain a URL", 3: "User-Agent appeared to contain a URL",
4: "Known client library", 4: "Known client library",
@ -395,5 +569,22 @@ func getIsBotReason(code isbot.Result) string {
myReasonMap[uint8(MyBlockedSourceIpPrefixIsBotReasonOffset+i)] = prefix.ShortDesc myReasonMap[uint8(MyBlockedSourceIpPrefixIsBotReasonOffset+i)] = prefix.ShortDesc
} }
return myReasonMap[uint8(code)] verbosityLevelLogPrint(fmt.Sprintf("reason fount by getIsBotReason: %v", myReasonMap[thisCodeAsUint8]), 5)
return myReasonMap[thisCodeAsUint8]
}
// A couple of helper functions, mainly for debugging
// checkErrWarn logs "msg: err" when err is non-nil and verbosityLevel does not
// exceed GLOBAL_VERBOSITY. In every other case it does nothing.
func checkErrWarn(err error, msg string, verbosityLevel int) {
	if err == nil || verbosityLevel > GLOBAL_VERBOSITY {
		return
	}
	log.Printf("%s: %v\n", msg, err)
}
func verbosityLevelLogPrint(msg string, verbosityLevel int) {
if verbosityLevel <= GLOBAL_VERBOSITY {
log.Printf("%s\n", msg)
}
} }

43
mainDiagram.mermaid Normal file
View file

@ -0,0 +1,43 @@
stateDiagram-v2
direction TB
accTitle: crawler-blocker Diagram
accDescr: Diagram of main.go
classDef systemShell fill:#00f1,font-weight:bold,stroke-width:1px,stroke:black
classDef subroutine fill: #dff0fe,stroke-width:1px,stroke:black
classDef nixShell fill:#00f1,font-weight:bold,stroke-width:1px,stroke:black
classDef terminal fill:black,color:#d7e0ff,font-weight:bold,stroke-width:1px,stroke:black
class nixShellEnvironment nixShell
class setConfigSubroutine, nixShellExitSubroutine subroutine
class mainFunction systemShell
class preEntryState, setConfigSubroutine, coswayCommand, postExitState terminal
preEntryState --> mainFunction
setConfigSubroutine --> dbStatSubroutine
callToConfigSelf --> handleFuncCall
mainFunction --> postExitState
state "Define types. For convenience and clarity." as preEntryState
state "Define global variables and constants. These make it easier to pass data between some of the other functions below." as preEntryState
state "entrypoint to main" as preEntryState
state "main exits and control is returned to the calling environment" as postExitState
state "main" as mainFunction {
state "configureSelf() is called" as callToConfigSelf {
state "Configuration Subroutine" as setConfigSubroutine
state "Import config from file, panic if error occurs" as setConfigSubroutine
state "Check that config parameters are within acceptable values. Exit if not." as setConfigSubroutine
state "DataBase Stat and Initialization Subroutine" as dbStatSubroutine
state "stat ./data directory. Create it if it doesn't exist." as dbStatSubroutine
}
state "call to http.HandlerFunc" as handleFuncCall
state "Register whackyHackyHandlerFunc for the pattern / on the DefaultServeMux." as handleFuncCall
state "The / pattern basically matches everything from the root of the URL path." as handleFuncCall
state "So here, all incoming requests should be handled by this function." as handleFuncCall
}

BIN
pkgnamezzz Executable file

Binary file not shown.

214
pkgnamezzz_test.go Normal file
View file

@ -0,0 +1,214 @@
package main
import (
"fmt"
"log"
"net/http"
"testing"
"time"
)
// EditableUserAgentTransport lets the tests send requests with a chosen
// User-Agent header while delegating the actual transfer to another
// http.RoundTripper.
type EditableUserAgentTransport struct {
	// CustomUserAgent is the User-Agent value to send.
	CustomUserAgent string
	// UnderlyingTransport is the round tripper that performs the request.
	UnderlyingTransport http.RoundTripper
}

// NewEditableUserAgentTransport returns a transport carrying desiredUserAgent
// and backed by http.DefaultTransport.
func NewEditableUserAgentTransport(desiredUserAgent string) EditableUserAgentTransport {
	return EditableUserAgentTransport{
		UnderlyingTransport: http.DefaultTransport,
		CustomUserAgent:     desiredUserAgent,
	}
}
// RoundTrip sends request through t.UnderlyingTransport with its User-Agent
// header replaced by t.CustomUserAgent.
//
// The header is set on a clone of the request: the http.RoundTripper contract
// says RoundTrip should not modify the caller's request. On failure the error
// is logged and returned with a nil response.
func (t *EditableUserAgentTransport) RoundTrip(request *http.Request) (*http.Response, error) {
	verbosityLevelLogPrint(fmt.Sprintf("roundtripping %s...", request.URL), 5)

	outgoing := request.Clone(request.Context())
	if outgoing.Header == nil {
		outgoing.Header = make(http.Header)
	}
	outgoing.Header.Set("User-Agent", t.CustomUserAgent)

	response, err := t.UnderlyingTransport.RoundTrip(outgoing)
	if err != nil {
		// RequestURI is only populated on server-side requests, so the
		// method and URL are logged instead.
		log.Printf("%s %s failed: %s", request.Method, request.URL, err)
		return nil, err
	}
	return response, nil
}
/*
func TestGetRobotsDotTxt(t *testing.T) {
fmt.Println("Running test TestGetRobotsDotTxt")
//It's assumed configureSelf() will take longer than setting up
//the server in runSimpleServer(), so no additional logic has been
//added to ensure the server is running before attempting to use it
//Should be fine, as it errs on the side of failure anyway.
go runSimpleServer(":3000", "TestGetRobotsDotTxt")
configureSelf()
thisRobots := getRobotsDotTxt()
robotsMsg := fmt.Sprintf("robots.txt: %v\n", thisRobots)
log.Printf(robotsMsg)
thisShouldMatch := "http://localhost:3002/respond200"
thisShouldNotMatch := "https://localhost:3002/respond303"
doesActuallyMatch := false
shouldNotMatchButDoes := false
for _, r := range thisRobots {
if r.MatchString(thisShouldMatch) {
doesActuallyMatch = true
}
if r.MatchString(thisShouldNotMatch) {
shouldNotMatchButDoes = true
}
}
if !doesActuallyMatch {
t.Errorf("Pattern %v should match robots regexp but does not.", thisShouldMatch)
}
if shouldNotMatchButDoes {
t.Errorf("Pattern %v should NOT match robots regexp, but it does.", thisShouldNotMatch)
}
}
*/
// ReqBoxForTest bundles everything needed for one test request.
type ReqBoxForTest struct {
	// TestURL is the URL to GET.
	TestURL string
	// StatusWant is the HTTP status code the response is expected to carry.
	StatusWant int
	// CustomClient is the client used to send the request.
	CustomClient http.Client
	// SleepTime is how long to pause after the request has been checked.
	SleepTime time.Duration
}

// DoRequestStuff GETs rb.TestURL with rb.CustomClient, reports a test error if
// the request fails or the status differs from rb.StatusWant, and then sleeps
// for rb.SleepTime.
//
// It returns the time spent inside the Get call; the trailing sleep is not
// included. The response is only inspected when Get succeeded, because it is
// nil otherwise, and its body is closed before returning.
func (rb ReqBoxForTest) DoRequestStuff(t *testing.T) (measuredFetchTime time.Duration) {
	verbosityLevelLogPrint(fmt.Sprintf("testURL: %v", rb.TestURL), 5)

	client := rb.CustomClient
	start := time.Now()
	resp, err := client.Get(rb.TestURL)
	measuredFetchTime = time.Since(start)

	if err != nil {
		t.Errorf("Error getting %s: %v", rb.TestURL, err)
	} else {
		if resp.StatusCode != rb.StatusWant {
			t.Errorf("Test should fail: Should have gotten status %v but got %v!", rb.StatusWant, resp.StatusCode)
		}
		fmt.Printf("thisResponse: %v\n", resp)
		if closeErr := resp.Body.Close(); closeErr != nil {
			t.Logf("closing response body for %s: %v", rb.TestURL, closeErr)
		}
	}

	time.Sleep(rb.SleepTime)
	return measuredFetchTime
}
// TestMain is an end-to-end test. It starts the stub upstream from
// runSimpleServer on :3000 and main() in background goroutines, then sends
// requests to http://localhost:3002 with two different User-Agent strings and
// checks status codes and the average request rate.
//
// NOTE(review): the name TestMain normally denotes the package-level hook
// func TestMain(m *testing.M). With a *testing.T parameter this is presumably
// run as an ordinary test — confirm, and consider renaming.
// NOTE(review): neither background goroutine is ever stopped.
func TestMain(t *testing.T) {
GLOBAL_VERBOSITY = 5
fmt.Println("Running test TestMain")
go runSimpleServer(":3000", "TestMain")
// Phase 1: a browser-like User-Agent.
commonRealUserAgent := "Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0"
thisTransport := NewEditableUserAgentTransport(commonRealUserAgent)
// The client stores a pointer to thisTransport, so the reassignment of
// thisTransport further down also changes what this client sends.
thisClient := http.Client{Transport: &thisTransport}
go main()
// Sleep while main sets up the reverse proxy.
// Inelegant, but it errs on the side of failure: if main is not ready
// after one second the requests below fail, so it's okay.
time.Sleep(1 * time.Second)
testCaseNormalUserLike := ReqBoxForTest{
TestURL: "http://localhost:3002/respond200",
StatusWant: 200,
CustomClient: thisClient,
SleepTime: 0 * time.Second,
}
testCaseNormalUserLike.DoRequestStuff(t)
// Same request as above, but issued 20 times back to back with no sleep.
testCaseMaliciousUser := ReqBoxForTest{
TestURL: "http://localhost:3002/respond200",
StatusWant: 200,
CustomClient: thisClient,
SleepTime: 0 * time.Second,
}
numRequestsDone := 0
testDuration := time.Duration(0)
for i := 0; i < 20; i++ {
// testDuration accumulates fetch times, so the message below reports the
// running total rather than the duration of iteration i alone.
testDuration += testCaseMaliciousUser.DoRequestStuff(t)
durationMsg := fmt.Sprintf("test iteration %v had duration %v", i, testDuration)
verbosityLevelLogPrint(durationMsg, 5)
numRequestsDone++
}
// Average rate in requests per second over the summed fetch times.
testRate := float64(numRequestsDone) / testDuration.Seconds()
totalDurationMsg := fmt.Sprintf("Rate limit test completed %v test requests in %v seconds", numRequestsDone, testDuration.Seconds())
verbosityLevelLogPrint(totalDurationMsg, 5)
// One request per second of slack is allowed above GLOBAL_RATE_LIMIT.
if testRate > GLOBAL_RATE_LIMIT+1 {
t.Errorf("MaliciousUser test should fail: reverse proxy failed to enforce global rate limit of %v. Tested average request rate was: %v", GLOBAL_RATE_LIMIT, testRate)
}
// Phase 2: a User-Agent that does not look like a browser.
sillyFakeUserAgent := "I am 3vil and undetectable 8D"
thisTransport = NewEditableUserAgentTransport(sillyFakeUserAgent)
thisClient = http.Client{Transport: &thisTransport}
maxNumRobotsViolations := config.TarpitAfterRecentRobotsTxtViolations
desiredRateLimit := config.TarpitAfterRobotsTxtViolationsPerSecond
// Pause between requests, in milliseconds, chosen so that the request rate
// stays at or below desiredRateLimit+1 per second.
neededSleepTime := 1000.0 / (desiredRateLimit + 1.0)
rateLimitMsg := fmt.Sprintf("desiredRateLimit: %v, so sleep time should be %v", desiredRateLimit, neededSleepTime)
verbosityLevelLogPrint(rateLimitMsg, 5)
testCaseDumbBotLike := ReqBoxForTest{
TestURL: "http://localhost:3002/respond200",
StatusWant: 403,
CustomClient: thisClient,
SleepTime: time.Duration(neededSleepTime) * time.Millisecond,
}
// NOTE(review): the 21 subtracted here presumably accounts for the 21
// requests to /respond200 already made in phase 1 — confirm.
for reqIndex := 0; reqIndex < maxNumRobotsViolations-21; reqIndex++ {
testCaseDumbBotLike.DoRequestStuff(t)
}
// NOTE(review): 403 is expected before this point and 200 from here on;
// confirm these expectations against the proxy's tarpit behaviour.
testCaseDumbBotPassesThreshold := ReqBoxForTest{
TestURL: "http://localhost:3002/respond200",
StatusWant: 200,
CustomClient: thisClient,
SleepTime: time.Duration(neededSleepTime) * time.Millisecond,
}
testCaseDumbBotPassesThreshold.DoRequestStuff(t)
// NOTE(review): this path is all lower-case, while the stub server registers
// "/respond200RobotsAllowed" — confirm which handler is meant to answer.
testCasePushRateLimit := ReqBoxForTest{
TestURL: "http://localhost:3002/respond200robotsallowed",
StatusWant: 200,
CustomClient: thisClient,
SleepTime: 0 * time.Second,
}
// Second rate measurement, same procedure as in phase 1.
numRequestsDone = 0
testDuration = time.Duration(0)
for i := 0; i < 20; i++ {
testDuration += testCasePushRateLimit.DoRequestStuff(t)
durationMsg := fmt.Sprintf("test iteration %v had duration %v", i, testDuration)
verbosityLevelLogPrint(durationMsg, 5)
numRequestsDone++
}
testRate = float64(numRequestsDone) / testDuration.Seconds()
totalDurationMsg = fmt.Sprintf("Rate limit test completed %v test requests in %v seconds", numRequestsDone, testDuration.Seconds())
verbosityLevelLogPrint(totalDurationMsg, 5)
if testRate > GLOBAL_RATE_LIMIT+1 {
t.Errorf("Test should fail: reverse proxy failed to enforce global rate limit of %v. Tested average request rate was: %v", GLOBAL_RATE_LIMIT, testRate)
}
// Phase 3: a 303 response followed by a request to /respond200 that is
// expected to succeed.
testCaseUserLoggingIn := ReqBoxForTest{
TestURL: "http://localhost:3002/respond303",
StatusWant: 303,
CustomClient: thisClient,
SleepTime: 0 * time.Second,
}
testCaseUserLoggingIn.DoRequestStuff(t)
testCaseUserAfterLoggedIn := ReqBoxForTest{
TestURL: "http://localhost:3002/respond200",
StatusWant: 200,
CustomClient: thisClient,
SleepTime: 0 * time.Second,
}
testCaseUserAfterLoggedIn.DoRequestStuff(t)
}

View file

@ -0,0 +1,16 @@
requestRate = requests/second
avgRequestRate = numberOfRequests/TotalTimeLoggingRequests
to be within limit, need:
avgRequestRate <= globalLimit
or:
numRequests/TotalTime <= globalLimit
numRequests/globalLimit <= TotalTime
We can control it with sleep time, so
numRequests/globalLimit <= TotalTime + SleepTime
so
numRequests/globalLimit - TotalTime <= SleepTime
We'll go with equal here, but we could also do a randrange with the left hand side as the minimum.

70
simpleServer.go Normal file
View file

@ -0,0 +1,70 @@
package main
import (
"fmt"
"io/ioutil"
"log"
"net/http"
)
// rootHandler is the catch-all handler of the stub server: it answers 200 with
// a fixed HTML body.
//
// All headers are set before WriteHeader, because header changes made after
// WriteHeader has been called have no effect on the response.
func rootHandler(w http.ResponseWriter, r *http.Request) {
	data := "Any page other than /respond303 or /respond200 should end up here"
	w.Header().Set("Content-Type", "text/html")
	w.Header().Set("Content-Length", fmt.Sprint(len(data)))
	w.WriteHeader(http.StatusOK)
	fmt.Fprint(w, data)
}
// robotsHandler serves the contents of testRobots.txt, read from the current
// working directory, with status 200.
//
// The file is read before any header is written, so a read failure is
// reported as a 500 response rather than a panic after a 200 status has
// already been committed. Headers are set before WriteHeader, because later
// changes have no effect on the response.
func robotsHandler(w http.ResponseWriter, r *http.Request) {
	fmt.Printf("robotsHandler called\n")

	data, err := ioutil.ReadFile("testRobots.txt")
	if err != nil {
		http.Error(w, fmt.Sprintf("reading testRobots.txt: %v", err), http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "text/html")
	w.Header().Set("Content-Length", fmt.Sprint(len(data)))
	w.WriteHeader(http.StatusOK)
	if _, err := w.Write(data); err != nil {
		log.Printf("writing robots.txt response: %v", err)
	}
}
// respond303Handler answers every request with status 303 and a short fixed
// body. No Location header is set.
//
// Headers are set before WriteHeader, because header changes made after
// WriteHeader has been called have no effect on the response.
func respond303Handler(w http.ResponseWriter, r *http.Request) {
	data := "You should get a 303 status!"
	w.Header().Set("Content-Type", "text/html")
	w.Header().Set("Content-Length", fmt.Sprint(len(data)))
	w.WriteHeader(http.StatusSeeOther)
	fmt.Fprint(w, data)
}
// respond200Handler answers every request with status 200 and a short fixed
// body.
//
// Headers are set before WriteHeader, because header changes made after
// WriteHeader has been called have no effect on the response.
func respond200Handler(w http.ResponseWriter, r *http.Request) {
	data := "You should get a 200 status!"
	w.Header().Set("Content-Type", "text/html")
	w.Header().Set("Content-Length", fmt.Sprint(len(data)))
	w.WriteHeader(http.StatusOK)
	fmt.Fprint(w, data)
}
// customHandlerToAvoidDefaultServeMux is an http.Handler that forwards to its
// own *http.ServeMux, so the stub server's routes are kept off
// http.DefaultServeMux.
type customHandlerToAvoidDefaultServeMux struct {
	// NondefaultServeMux is the mux that actually routes the request.
	NondefaultServeMux *http.ServeMux
}

// ServeHTTP logs the request URL at verbosity level 5 and then hands the
// request to the wrapped mux.
func (h customHandlerToAvoidDefaultServeMux) ServeHTTP(rw http.ResponseWriter, req *http.Request) {
	callMsg := fmt.Sprintf("call to customHandlerToAvoidDefaultServeMux with request URL: %v", req.URL)
	verbosityLevelLogPrint(callMsg, 5)

	mux := h.NondefaultServeMux
	mux.ServeHTTP(rw, req)
}
// runSimpleServer starts the stub upstream HTTP server on portIn and blocks.
// callingFunc is only used in a log message identifying the caller.
//
// It sets GLOBAL_VERBOSITY to 5, registers the stub handlers on a fresh
// ServeMux (not http.DefaultServeMux, to keep them apart from the main()
// server), and passes any ListenAndServe error to log.Fatal.
func runSimpleServer(portIn string, callingFunc string) {
	GLOBAL_VERBOSITY = 5

	mux := http.NewServeMux()
	testHandler := customHandlerToAvoidDefaultServeMux{NondefaultServeMux: mux}

	routes := []struct {
		pattern string
		handler func(http.ResponseWriter, *http.Request)
	}{
		{"/", rootHandler},
		{"/respond303", respond303Handler},
		{"/respond200", respond200Handler},
		{"/respond200RobotsAllowed", respond200Handler},
		{"/robots.txt", robotsHandler},
	}
	for _, route := range routes {
		mux.HandleFunc(route.pattern, route.handler)
	}

	calledByMsg := fmt.Sprintf("runSimpleServer called by %v\n", callingFunc)
	verbosityLevelLogPrint(calledByMsg, 5)
	listenMsg := fmt.Sprintf("About to ListenAndServe with params ( %v , %v )", portIn, testHandler)
	verbosityLevelLogPrint(listenMsg, 5)

	serveErr := http.ListenAndServe(portIn, testHandler)
	log.Fatal(serveErr)
}

4
testRobots.txt Normal file
View file

@ -0,0 +1,4 @@
disallow: https://localhost:8080
disallow: http://localhost:3002/respond200
disallow: http://localhost:3000/respond200
disallow: /respond200

3382
testlog Normal file

File diff suppressed because it is too large Load diff

2899
testlog.txt Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,31 @@
stateDiagram-v2
direction TB
accTitle: crawler-blocker Diagram
accDescr: Diagram of main.go
classDef systemShell fill:#00f1,font-weight:bold,stroke-width:1px,stroke:black
classDef subroutine fill: #dff0fe,stroke-width:1px,stroke:black
classDef nixShell fill:#00f1,font-weight:bold,stroke-width:1px,stroke:black
classDef terminal fill:black,color:#d7e0ff,font-weight:bold,stroke-width:1px,stroke:black
class nixShellEnvironment nixShell
class setConfigSubroutine, nixShellExitSubroutine subroutine
class mainFunction systemShell
class preEntryState, setConfigSubroutine, coswayCommand, postExitState terminal
preEntryState --> mainFunction
setConfigSubroutine --> setConfigSubroutine
state "DefaultServeMux (I think) listens on ports 443 and 80, or something" as preEntryState
state "HTTP request on 443 triggers entrypoint to whackyHackyHandlerFunc" as preEntryState
state "whackyHackyHandlerFunc" as whhf {
state "set reverseProxyInstance, of type ModifiedReverseProxy." as instantiateReverseProxy
state "This type composes ('inherits', though not technically) the ReverseProxy type from the stdlib." as instantiateReverseProxy
}
state "return from whackyHackyHandlerFunc, probably killing a goroutine that was spawned by DefaultServeMux." as return