symys-add-loggedin-dontblock #1
17 changed files with 8898 additions and 99857 deletions
16
ReadMe.md
16
ReadMe.md
|
@ -1,13 +1,23 @@
|
||||||
# forgejo-crawler-blocker
|
# forgejo-crawler-blocker
|
||||||
|
|
||||||
|
What does a GPT training web-crawler see when it tries to access our forgejo instance and look at every single file at every single commit, ignoring `robots.txt` and sending a generic user-agent header?
|
||||||
|
|
||||||
|
What can
|
||||||
|
I say except
|
||||||
|
Hail Eris,
|
||||||
|
All Hail Discordia!
|
||||||
|
|
||||||
|
|
||||||
|
# maintenance
|
||||||
|
|
||||||
if anyone needs to clear the data to unblock someone, these are the commands to run on paimon:
|
if anyone needs to clear the data to unblock someone, these are the commands to run on paimon:
|
||||||
|
|
||||||
`sudo -i`
|
`sudo -i`
|
||||||
|
|
||||||
```
|
```
|
||||||
docker stop gitea_forgejo-crawler-blocker_1
|
docker stop gitea_forgejo-crawler-blocker
|
||||||
rm /etc/docker-compose/gitea/forgejo-crawler-blocker/traffic.db
|
rm /etc/docker-compose/gitea/forgejo-crawler-blocker/traffic.db
|
||||||
docker start gitea_forgejo-crawler-blocker_1
|
docker start gitea_forgejo-crawler-blocker
|
||||||
```
|
```
|
||||||
|
|
||||||
# persistent data storage
|
# persistent data storage
|
||||||
|
@ -21,4 +31,4 @@ Run on server:
|
||||||
(paimon)
|
(paimon)
|
||||||
```
|
```
|
||||||
cd /home/forest/forgejo-crawler-blocker && git pull sequentialread main && cd /etc/docker-compose/gitea && docker stop gitea_forgejo-crawler-blocker_1 || true && docker rm gitea_forgejo-crawler-blocker_1 || true && docker image rm gitea_forgejo-crawler-blocker || true && rm -f forgejo-crawler-blocker/traffic.db && docker-compose up -d && sleep 1 && docker logs -n 1000 -f gitea_forgejo-crawler-blocker_1
|
cd /home/forest/forgejo-crawler-blocker && git pull sequentialread main && cd /etc/docker-compose/gitea && docker stop gitea_forgejo-crawler-blocker_1 || true && docker rm gitea_forgejo-crawler-blocker_1 || true && docker image rm gitea_forgejo-crawler-blocker || true && rm -f forgejo-crawler-blocker/traffic.db && docker-compose up -d && sleep 1 && docker logs -n 1000 -f gitea_forgejo-crawler-blocker_1
|
||||||
```
|
```
|
||||||
|
|
11
config.json
11
config.json
|
@ -1,15 +1,20 @@
|
||||||
{
|
{
|
||||||
"UpstreamHostname": "gitea",
|
"UpstreamHostname": "localhost",
|
||||||
"UpstreamPort": 3000,
|
"UpstreamPort": 3000,
|
||||||
"ListenPort": 3002,
|
"ListenPort": 3002,
|
||||||
"TarpitAfterRecentRobotsTxtViolations": 100,
|
"TarpitAfterRecentRobotsTxtViolations": 100,
|
||||||
"TarpitAfterRobotsTxtViolationsPerSecond": 0.1,
|
"TarpitAfterRobotsTxtViolationsPerSecond": 20.0,
|
||||||
|
"GlobalRateLimit": 5.0,
|
||||||
|
"AllowedPaths": [
|
||||||
|
"/respond303",
|
||||||
|
"respond200RobotsAllowed"
|
||||||
|
],
|
||||||
|
"GlobalVerbosityLevel":5,
|
||||||
"BlockedUserAgentRegexes": [
|
"BlockedUserAgentRegexes": [
|
||||||
"Python/[^ ]+ aiohttp/.*"
|
"Python/[^ ]+ aiohttp/.*"
|
||||||
],
|
],
|
||||||
"WhitelistUserAgentRegexes": [
|
"WhitelistUserAgentRegexes": [
|
||||||
"^git/[0-9\\.]*$",
|
"^git/[0-9\\.]*$",
|
||||||
"^JGit/[0-9a-z\\.]*$",
|
|
||||||
"https://github.com/matrix-org/synapse"
|
"https://github.com/matrix-org/synapse"
|
||||||
],
|
],
|
||||||
"BlockedSourceIpPrefixes": [
|
"BlockedSourceIpPrefixes": [
|
||||||
|
|
198
crawlpit_test.go
Normal file
198
crawlpit_test.go
Normal file
|
@ -0,0 +1,198 @@
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"log"
|
||||||
|
"net/http"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
type EditableUserAgentTransport struct {
|
||||||
|
CustomUserAgent string
|
||||||
|
UnderlyingTransport http.RoundTripper
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewEditableUserAgentTransport(desiredUserAgent string) EditableUserAgentTransport {
|
||||||
|
transportOut := EditableUserAgentTransport{
|
||||||
|
CustomUserAgent: desiredUserAgent,
|
||||||
|
UnderlyingTransport: http.DefaultTransport,
|
||||||
|
}
|
||||||
|
return transportOut
|
||||||
|
}
|
||||||
|
|
||||||
|
func (t *EditableUserAgentTransport) RoundTrip(request *http.Request) (*http.Response, error) {
|
||||||
|
verbosityLevelLogPrint(fmt.Sprintf("roundtripping %s...", request.URL), 5)
|
||||||
|
request.Header["User-Agent"] = []string{t.CustomUserAgent}
|
||||||
|
response, err := t.UnderlyingTransport.RoundTrip(request)
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("contacting %s (%s) failed: %s", request.RequestURI, request.URL.String(), err)
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return response, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
func TestGetRobotsDotTxt(t *testing.T) {
|
||||||
|
fmt.Println("Running test TestGetRobotsDotTxt")
|
||||||
|
//It's assumed configureSelf() will take longer than setting up
|
||||||
|
//the server in runSimpleServer(), so no additional logic has been
|
||||||
|
//added to ensure the server is running before attempting to use it
|
||||||
|
//Should be fine, as it errs on the side of failure anyway.
|
||||||
|
go runSimpleServer(":3000", "TestGetRobotsDotTxt")
|
||||||
|
configureSelf()
|
||||||
|
|
||||||
|
thisRobots := getRobotsDotTxt()
|
||||||
|
robotsMsg := fmt.Sprintf("robots.txt: %v\n", thisRobots)
|
||||||
|
log.Printf(robotsMsg)
|
||||||
|
thisShouldMatch := "http://localhost:3002/respond200"
|
||||||
|
thisShouldNotMatch := "https://localhost:3002/respond303"
|
||||||
|
|
||||||
|
doesActuallyMatch := false
|
||||||
|
shouldNotMatchButDoes := false
|
||||||
|
for _, r := range thisRobots {
|
||||||
|
if r.MatchString(thisShouldMatch) {
|
||||||
|
doesActuallyMatch = true
|
||||||
|
}
|
||||||
|
if r.MatchString(thisShouldNotMatch) {
|
||||||
|
shouldNotMatchButDoes = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !doesActuallyMatch {
|
||||||
|
t.Errorf("Pattern %v should match robots regexp but does not.", thisShouldMatch)
|
||||||
|
}
|
||||||
|
if shouldNotMatchButDoes {
|
||||||
|
t.Errorf("Pattern %v should NOT match robots regexp, but it does.", thisShouldNotMatch)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
type TestCase struct {
|
||||||
|
Name string
|
||||||
|
TestURL string
|
||||||
|
StatusWant int
|
||||||
|
CustomClient http.Client
|
||||||
|
SleepTime time.Duration
|
||||||
|
}
|
||||||
|
|
||||||
|
func (rb TestCase) DoRequestStuff(t *testing.T) (measuredFetchTime time.Duration) {
|
||||||
|
|
||||||
|
testURL := rb.TestURL
|
||||||
|
testURLMsg := fmt.Sprintf("testURL: %v", testURL)
|
||||||
|
verbosityLevelLogPrint(testURLMsg, 5)
|
||||||
|
thisClient := rb.CustomClient
|
||||||
|
timeBeforeGet := time.Now()
|
||||||
|
thisResponse, err := thisClient.Get(testURL)
|
||||||
|
measuredFetchTime = time.Since(timeBeforeGet)
|
||||||
|
statusWant := rb.StatusWant
|
||||||
|
statusGot := thisResponse.StatusCode
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("Error getting %s: %v", testURL, err)
|
||||||
|
} else if statusWant != statusGot {
|
||||||
|
t.Errorf("Test `%v` should fail: Should have gotten status %v but got %v!", rb.Name, statusWant, statusGot)
|
||||||
|
}
|
||||||
|
fmt.Printf("thisResponse: %v\n", thisResponse)
|
||||||
|
time.Sleep(rb.SleepTime)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestMain(t *testing.T) {
|
||||||
|
GLOBAL_VERBOSITY = 5
|
||||||
|
fmt.Println("Running test TestMain")
|
||||||
|
go runSimpleServer(":3000", "TestMain")
|
||||||
|
|
||||||
|
commonRealUserAgent := "Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0"
|
||||||
|
thisTransport := NewEditableUserAgentTransport(commonRealUserAgent)
|
||||||
|
thisClient := http.Client{Transport: &thisTransport}
|
||||||
|
go main()
|
||||||
|
|
||||||
|
//Sleep while main sets up the reverse proxy
|
||||||
|
//Inelegant, but errs on the side of failure
|
||||||
|
//so its okay
|
||||||
|
time.Sleep(1 * time.Second)
|
||||||
|
|
||||||
|
testCaseNormalUserLike := TestCase{
|
||||||
|
Name: "Normal User test case",
|
||||||
|
TestURL: "http://localhost:3002/respond200",
|
||||||
|
StatusWant: 200,
|
||||||
|
CustomClient: thisClient,
|
||||||
|
SleepTime: 0 * time.Second,
|
||||||
|
}
|
||||||
|
|
||||||
|
testCaseNormalUserLike.DoRequestStuff(t)
|
||||||
|
|
||||||
|
testCaseMaliciousUser := TestCase{
|
||||||
|
Name: "Malicious user test case",
|
||||||
|
TestURL: "http://localhost:3002/respond200",
|
||||||
|
StatusWant: 200,
|
||||||
|
CustomClient: thisClient,
|
||||||
|
SleepTime: 0 * time.Second,
|
||||||
|
}
|
||||||
|
|
||||||
|
numRequestsDone := 0
|
||||||
|
testDuration := time.Duration(0)
|
||||||
|
for i := 0; i < 20; i++ {
|
||||||
|
testDuration += testCaseMaliciousUser.DoRequestStuff(t)
|
||||||
|
durationMsg := fmt.Sprintf("test iteration %v had duration %v", i, testDuration)
|
||||||
|
verbosityLevelLogPrint(durationMsg, 5)
|
||||||
|
numRequestsDone++
|
||||||
|
}
|
||||||
|
|
||||||
|
testRate := float64(numRequestsDone) / testDuration.Seconds()
|
||||||
|
totalDurationMsg := fmt.Sprintf("Rate limit test completed %v test requests in %v seconds", numRequestsDone, testDuration.Seconds())
|
||||||
|
verbosityLevelLogPrint(totalDurationMsg, 5)
|
||||||
|
if testRate > GLOBAL_RATE_LIMIT+1000 {
|
||||||
|
t.Errorf("MaliciousUser test should fail: reverse proxy failed to enforce global rate limit of %v. Tested average request rate was: %v", GLOBAL_RATE_LIMIT, testRate)
|
||||||
|
}
|
||||||
|
|
||||||
|
sillyFakeUserAgent := "I am 3vil and undetectable 8D"
|
||||||
|
thisTransport = NewEditableUserAgentTransport(sillyFakeUserAgent)
|
||||||
|
thisClient = http.Client{Transport: &thisTransport}
|
||||||
|
|
||||||
|
maxNumRobotsViolations := config.TarpitAfterRecentRobotsTxtViolations
|
||||||
|
desiredRateLimit := config.TarpitAfterRobotsTxtViolationsPerSecond
|
||||||
|
neededSleepTime := 1000.0 / (desiredRateLimit + 1.0)
|
||||||
|
rateLimitMsg := fmt.Sprintf("desiredRateLimit: %v, so sleep time should be %v", desiredRateLimit, neededSleepTime)
|
||||||
|
verbosityLevelLogPrint(rateLimitMsg, 5)
|
||||||
|
|
||||||
|
testCaseRespectfulBotLike := TestCase{
|
||||||
|
Name: "Respectful bot test case",
|
||||||
|
TestURL: "http://localhost:3002/respond200",
|
||||||
|
StatusWant: 403,
|
||||||
|
CustomClient: thisClient,
|
||||||
|
SleepTime: time.Duration(neededSleepTime) * time.Millisecond,
|
||||||
|
}
|
||||||
|
for reqIndex := 0; reqIndex < maxNumRobotsViolations-21; reqIndex++ {
|
||||||
|
testCaseRespectfulBotLike.DoRequestStuff(t)
|
||||||
|
}
|
||||||
|
|
||||||
|
testCaseRespectfulBotPassesThreshold := TestCase{
|
||||||
|
Name: "Test case which should be the first to pass the tarpit threshold.",
|
||||||
|
TestURL: "http://localhost:3002/respond200",
|
||||||
|
StatusWant: 200,
|
||||||
|
CustomClient: thisClient,
|
||||||
|
SleepTime: time.Duration(neededSleepTime) * time.Millisecond,
|
||||||
|
}
|
||||||
|
testCaseRespectfulBotPassesThreshold.DoRequestStuff(t)
|
||||||
|
|
||||||
|
testCaseUserLoggingIn := TestCase{
|
||||||
|
Name: "Test case case to check that a user can visit explicitly allowed page",
|
||||||
|
TestURL: "http://localhost:3002/respond303",
|
||||||
|
StatusWant: 303,
|
||||||
|
CustomClient: thisClient,
|
||||||
|
SleepTime: 0 * time.Second,
|
||||||
|
}
|
||||||
|
testCaseUserLoggingIn.DoRequestStuff(t)
|
||||||
|
|
||||||
|
testCaseUserAfterLoggedIn := TestCase{
|
||||||
|
Name: "Test case to check for already logged-in user",
|
||||||
|
TestURL: "http://localhost:3002/respond200",
|
||||||
|
StatusWant: 200,
|
||||||
|
CustomClient: thisClient,
|
||||||
|
SleepTime: 0 * time.Second,
|
||||||
|
}
|
||||||
|
|
||||||
|
testCaseUserAfterLoggedIn.DoRequestStuff(t)
|
||||||
|
|
||||||
|
}
|
17
go.mod
17
go.mod
|
@ -1,11 +1,14 @@
|
||||||
module git.sequentialread.com/forest/forgejo-crawler-blocker
|
module crawlpit
|
||||||
|
|
||||||
go 1.19
|
go 1.21.6
|
||||||
|
|
||||||
require (
|
require (
|
||||||
git.sequentialread.com/forest/config-lite v0.0.0-20220225195944-164dc71bce04 // indirect
|
git.sequentialread.com/forest/config-lite v0.0.0-20220225195944-164dc71bce04
|
||||||
github.com/boltdb/bolt v1.3.1 // indirect
|
github.com/boltdb/bolt v1.3.1
|
||||||
github.com/texttheater/golang-levenshtein/levenshtein v0.0.0-20200805054039-cae8b0eaed6c // indirect
|
zgo.at/isbot v1.0.0
|
||||||
golang.org/x/sys v0.16.0 // indirect
|
)
|
||||||
zgo.at/isbot v1.0.0 // indirect
|
|
||||||
|
require (
|
||||||
|
github.com/texttheater/golang-levenshtein/levenshtein v0.0.0-20200805054039-cae8b0eaed6c // indirect
|
||||||
|
golang.org/x/sys v0.17.0 // indirect
|
||||||
)
|
)
|
||||||
|
|
4
go.sum
4
go.sum
|
@ -4,7 +4,7 @@ github.com/boltdb/bolt v1.3.1 h1:JQmyP4ZBrce+ZQu0dY660FMfatumYDLun9hBCUVIkF4=
|
||||||
github.com/boltdb/bolt v1.3.1/go.mod h1:clJnj/oiGkjum5o1McbSZDSLxVThjynRyGBgiAx27Ps=
|
github.com/boltdb/bolt v1.3.1/go.mod h1:clJnj/oiGkjum5o1McbSZDSLxVThjynRyGBgiAx27Ps=
|
||||||
github.com/texttheater/golang-levenshtein/levenshtein v0.0.0-20200805054039-cae8b0eaed6c h1:HelZ2kAFadG0La9d+4htN4HzQ68Bm2iM9qKMSMES6xg=
|
github.com/texttheater/golang-levenshtein/levenshtein v0.0.0-20200805054039-cae8b0eaed6c h1:HelZ2kAFadG0La9d+4htN4HzQ68Bm2iM9qKMSMES6xg=
|
||||||
github.com/texttheater/golang-levenshtein/levenshtein v0.0.0-20200805054039-cae8b0eaed6c/go.mod h1:JlzghshsemAMDGZLytTFY8C1JQxQPhnatWqNwUXjggo=
|
github.com/texttheater/golang-levenshtein/levenshtein v0.0.0-20200805054039-cae8b0eaed6c/go.mod h1:JlzghshsemAMDGZLytTFY8C1JQxQPhnatWqNwUXjggo=
|
||||||
golang.org/x/sys v0.16.0 h1:xWw16ngr6ZMtmxDyKyIgsE93KNKz5HKmMa3b8ALHidU=
|
golang.org/x/sys v0.17.0 h1:25cE3gD+tdBA7lp7QfhuV+rJiE9YXTcS3VG1SqssI/Y=
|
||||||
golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||||
zgo.at/isbot v1.0.0 h1:wM+40WO2LudR5352eCAQlHQdWsdNVpngB8Oe1qlTpdM=
|
zgo.at/isbot v1.0.0 h1:wM+40WO2LudR5352eCAQlHQdWsdNVpngB8Oe1qlTpdM=
|
||||||
zgo.at/isbot v1.0.0/go.mod h1:/w+LTWbHsV7G5fT41VnwM0kTdk1H/prynfMqaxcBWjA=
|
zgo.at/isbot v1.0.0/go.mod h1:/w+LTWbHsV7G5fT41VnwM0kTdk1H/prynfMqaxcBWjA=
|
||||||
|
|
44
logicalDiagram.mermaid
Normal file
44
logicalDiagram.mermaid
Normal file
|
@ -0,0 +1,44 @@
|
||||||
|
stateDiagram-v2
|
||||||
|
direction TB
|
||||||
|
|
||||||
|
accTitle: crawler-blocker Diagram
|
||||||
|
accDescr: Diagram of main.go
|
||||||
|
|
||||||
|
|
||||||
|
classDef systemShell fill:#00f1,font-weight:bold,stroke-width:1px,stroke:black
|
||||||
|
classDef subroutine fill: #dff0fe,stroke-width:1px,stroke:black
|
||||||
|
classDef nixShell fill:#00f1,font-weight:bold,stroke-width:1px,stroke:black
|
||||||
|
classDef terminal fill:black,color:#d7e0ff,font-weight:bold,stroke-width:1px,stroke:black
|
||||||
|
|
||||||
|
class nixShellEnvironment nixShell
|
||||||
|
class setConfigSubroutine, nixShellExitSubroutine subroutine
|
||||||
|
class mainFunction systemShell
|
||||||
|
class preEntryState, setConfigSubroutine, coswayCommand, postExitState terminal
|
||||||
|
|
||||||
|
preEntryState --> mainFunction
|
||||||
|
setConfigSubroutine --> setConfigSubroutine
|
||||||
|
setConfigSubroutine --> nixShellEnvironment
|
||||||
|
nixShellEnvironment --> nixShellExitSubroutine
|
||||||
|
mainFunction --> postExitState
|
||||||
|
|
||||||
|
state "Define types. For convenience and clarity." as preEntryState
|
||||||
|
state "Define global variables and constants. These make it easier to pass data between some of the other functions below." as preEntryState
|
||||||
|
state "entrypoint to main" as preEntryState
|
||||||
|
state "main exits and control is returned to the calling environment" as postExitState
|
||||||
|
state "main" as mainFunction {
|
||||||
|
|
||||||
|
state "configureSelf() is called" as callToConfigSelf {
|
||||||
|
|
||||||
|
state "Configuration Subroutine" as setConfigSubroutine
|
||||||
|
state "Import config from file, panic if error occurs" as setConfigSubroutine
|
||||||
|
state "Check that config parameters are within acceptable values. Exit if not." as setConfigSubroutine
|
||||||
|
|
||||||
|
state "DataBase Stat and Initialization Subroutine" as dbStatSubroutine
|
||||||
|
state "stat ./data directory. Create it if it doesn't exist." as dbStatSubroutine
|
||||||
|
}
|
||||||
|
state "call http.HandleFunc(\"/\", whackyHackyHandlerFunc)" as handleFuncCall
|
||||||
|
state "Register whackyHackyHandlerFunc with pattern / to the DefaultServeMux." as handleFuncCall
|
||||||
|
state "The / pattern basically matches everything from the root of the URL path." as handleFuncCall
|
||||||
|
state "So here, all incoming requests should be handled by this function." as handleFuncCall
|
||||||
|
|
||||||
|
}
|
577
main.go
577
main.go
|
@ -21,13 +21,43 @@ import (
|
||||||
isbot "zgo.at/isbot"
|
isbot "zgo.at/isbot"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
type ResponseCodeReportingTransport struct {
|
||||||
|
UnderlyingTransport http.RoundTripper
|
||||||
|
ReportingChannel chan string
|
||||||
|
}
|
||||||
|
|
||||||
|
var AllowIPChan chan string
|
||||||
|
|
||||||
|
func (t *ResponseCodeReportingTransport) RoundTrip(request *http.Request) (*http.Response, error) {
|
||||||
|
verbosityLevelLogPrint(fmt.Sprintf("roundtripping %s...", request.URL), 5)
|
||||||
|
response, err := t.UnderlyingTransport.RoundTrip(request)
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("contacting %s (%s) failed: %s", request.RequestURI, request.URL.String(), err)
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
verbosityLevelLogPrint("About to check status code for 303 within custom RoundTrip implementation", 5)
|
||||||
|
if response.StatusCode == http.StatusSeeOther {
|
||||||
|
verbosityLevelLogPrint("Found status 303, sending to channel", 5)
|
||||||
|
remoteIPNoPort := request.RemoteAddr[:strings.LastIndex(request.RemoteAddr, ":")]
|
||||||
|
AllowIPChan <- remoteIPNoPort
|
||||||
|
mapMsg := fmt.Sprintf("Sent the following IP to AllowIPChan: %v", remoteIPNoPort)
|
||||||
|
verbosityLevelLogPrint(mapMsg, 5)
|
||||||
|
}
|
||||||
|
verbosityLevelLogPrint("Checked status code for 303.", 5)
|
||||||
|
|
||||||
|
return response, nil
|
||||||
|
}
|
||||||
|
|
||||||
type Config struct {
|
type Config struct {
|
||||||
UpstreamHostname string
|
UpstreamHostname string
|
||||||
UpstreamPort int
|
UpstreamPort int
|
||||||
ListenPort int
|
ListenPort int
|
||||||
|
GlobalRateLimit float64
|
||||||
|
AllowedPaths []string
|
||||||
|
GlobalVerbosityLevel int
|
||||||
TarpitAfterRecentRobotsTxtViolations int
|
TarpitAfterRecentRobotsTxtViolations int
|
||||||
TarpitAfterRobotsTxtViolationsPerSecond float64
|
TarpitAfterRobotsTxtViolationsPerSecond float64
|
||||||
HardBlockBasedOnRobotsTxt bool
|
|
||||||
WhitelistUserAgentRegexes []string
|
WhitelistUserAgentRegexes []string
|
||||||
BlockedUserAgentRegexes []string
|
BlockedUserAgentRegexes []string
|
||||||
BlockedSourceIpPrefixes []BlockedSourceIpPrefix
|
BlockedSourceIpPrefixes []BlockedSourceIpPrefix
|
||||||
|
@ -40,22 +70,63 @@ type BlockedSourceIpPrefix struct {
|
||||||
}
|
}
|
||||||
|
|
||||||
type RequestsLog struct {
|
type RequestsLog struct {
|
||||||
|
AllowEverywhere bool `json:allow_everywhere`
|
||||||
TotalRequests int64 `json:total_requests`
|
TotalRequests int64 `json:total_requests`
|
||||||
|
NumAveragedRequests int64 `json:averaged_requests`
|
||||||
RecentRobotsTxtViolations int64 `json:recent_requests`
|
RecentRobotsTxtViolations int64 `json:recent_requests`
|
||||||
RollingAvgRequestRate float64 `json:rolling_average_request_rate`
|
RollingAvgRequestRate float64 `json:rolling_average_request_rate`
|
||||||
TimeOfLastRequest time.Time `json:time_of_last_request`
|
TimeOfLastRequest time.Time `json:time_of_last_request`
|
||||||
|
TimeOfFirstRequest time.Time `json:time_of_first_request`
|
||||||
Blocked time.Time `json:time_of_blocked`
|
Blocked time.Time `json:time_of_blocked`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type LoggedInMap map[string]time.Time
|
||||||
|
|
||||||
var BlockedUserAgentRegexes []*regexp.Regexp
|
var BlockedUserAgentRegexes []*regexp.Regexp
|
||||||
var WhitelistUserAgentRegexes []*regexp.Regexp
|
var WhitelistUserAgentRegexes []*regexp.Regexp
|
||||||
|
var ExplicitlyAllowedPaths []*regexp.Regexp
|
||||||
|
var robotsTxtDisallow []*regexp.Regexp
|
||||||
var db *bolt.DB
|
var db *bolt.DB
|
||||||
var config Config
|
var config Config
|
||||||
|
var GLOBAL_VERBOSITY int
|
||||||
|
var GLOBAL_RATE_LIMIT float64 //in requests per second
|
||||||
|
|
||||||
const MyBlockedSourceIpPrefixIsBotReasonOffset = 40
|
const MyBlockedSourceIpPrefixIsBotReasonOffset = 40
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
|
|
||||||
|
AllowIPChan = make(chan string, 10)
|
||||||
|
|
||||||
|
verbosityLevelLogPrint(fmt.Sprintf("GLOBAL_VERBOSITY set to: %v", GLOBAL_VERBOSITY), 5)
|
||||||
|
|
||||||
|
verbosityLevelLogPrint("Before call to configureSelf()", 5)
|
||||||
|
configureSelf()
|
||||||
|
verbosityLevelLogPrint("After call to configureSelf()", 5)
|
||||||
|
|
||||||
|
verbosityLevelLogPrint("Before opening database", 5)
|
||||||
|
var err error
|
||||||
|
db, err = bolt.Open("data/traffic.db", 0600, nil)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatal(err)
|
||||||
|
}
|
||||||
|
defer db.Close()
|
||||||
|
verbosityLevelLogPrint("After opening database", 5)
|
||||||
|
|
||||||
|
verbosityLevelLogPrint("Before call to getRobotsDotTxt()", 5)
|
||||||
|
robotsTxtDisallow = getRobotsDotTxt()
|
||||||
|
verbosityLevelLogPrint("After call to getRobotsDotTxt()", 5)
|
||||||
|
|
||||||
|
verbosityLevelLogPrint("Registering whackyHackyHandlerFunc as root handler", 5)
|
||||||
|
http.HandleFunc("/", whackyHackyHandlerFunc)
|
||||||
|
|
||||||
|
listenMessage := fmt.Sprintf("im about to try to listen on port %d and forward to http://%s:%d!\n",
|
||||||
|
config.ListenPort, config.UpstreamHostname, config.UpstreamPort)
|
||||||
|
verbosityLevelLogPrint(listenMessage, 1)
|
||||||
|
log.Fatal(http.ListenAndServe(fmt.Sprintf(":%d", config.ListenPort), nil))
|
||||||
|
}
|
||||||
|
|
||||||
|
func configureSelf() {
|
||||||
|
|
||||||
config = Config{}
|
config = Config{}
|
||||||
ignoreCommandlineFlags := []string{}
|
ignoreCommandlineFlags := []string{}
|
||||||
err := configlite.ReadConfiguration("config.json", "FCB", ignoreCommandlineFlags, reflect.ValueOf(&config))
|
err := configlite.ReadConfiguration("config.json", "FCB", ignoreCommandlineFlags, reflect.ValueOf(&config))
|
||||||
|
@ -66,11 +137,14 @@ func main() {
|
||||||
configBytes, _ := json.MarshalIndent(config, "", " ")
|
configBytes, _ := json.MarshalIndent(config, "", " ")
|
||||||
log.Println(string(configBytes))
|
log.Println(string(configBytes))
|
||||||
|
|
||||||
|
GLOBAL_VERBOSITY = config.GlobalVerbosityLevel
|
||||||
|
GLOBAL_RATE_LIMIT = config.GlobalRateLimit
|
||||||
|
|
||||||
if config.TarpitAfterRecentRobotsTxtViolations < 10 {
|
if config.TarpitAfterRecentRobotsTxtViolations < 10 {
|
||||||
log.Fatalf("TarpitAfterRecentRobotsTxtViolations (%d) must be greater than 10", config.TarpitAfterRecentRobotsTxtViolations)
|
log.Fatalf("TarpitAfterRecentRobotsTxtViolations (%d) must be greater than 10", config.TarpitAfterRecentRobotsTxtViolations)
|
||||||
}
|
}
|
||||||
if config.TarpitAfterRobotsTxtViolationsPerSecond < 0.001 {
|
if config.TarpitAfterRobotsTxtViolationsPerSecond < 0.001 {
|
||||||
log.Fatalf("TarpitAfterRobotsTxtViolationsPerSecond (%d) must be greater than 0.001", config.TarpitAfterRobotsTxtViolationsPerSecond)
|
log.Fatalf("TarpitAfterRobotsTxtViolationsPerSecond (%f) must be greater than 0.001", config.TarpitAfterRobotsTxtViolationsPerSecond)
|
||||||
}
|
}
|
||||||
|
|
||||||
_, err = os.Stat("data")
|
_, err = os.Stat("data")
|
||||||
|
@ -82,12 +156,6 @@ func main() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
db, err = bolt.Open("data/traffic.db", 0600, nil)
|
|
||||||
if err != nil {
|
|
||||||
log.Fatal(err)
|
|
||||||
}
|
|
||||||
defer db.Close()
|
|
||||||
|
|
||||||
BlockedUserAgentRegexes = []*regexp.Regexp{}
|
BlockedUserAgentRegexes = []*regexp.Regexp{}
|
||||||
for _, r := range config.BlockedUserAgentRegexes {
|
for _, r := range config.BlockedUserAgentRegexes {
|
||||||
BlockedUserAgentRegexes = append(BlockedUserAgentRegexes, regexp.MustCompile(r))
|
BlockedUserAgentRegexes = append(BlockedUserAgentRegexes, regexp.MustCompile(r))
|
||||||
|
@ -98,88 +166,137 @@ func main() {
|
||||||
WhitelistUserAgentRegexes = append(WhitelistUserAgentRegexes, regexp.MustCompile(r))
|
WhitelistUserAgentRegexes = append(WhitelistUserAgentRegexes, regexp.MustCompile(r))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ExplicitlyAllowedPaths = []*regexp.Regexp{}
|
||||||
|
for _, pathPattern := range config.AllowedPaths {
|
||||||
|
ExplicitlyAllowedPaths = append(ExplicitlyAllowedPaths, regexp.MustCompile(pathPattern))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func lookupHostname(hostname string) string {
|
||||||
|
|
||||||
dnsCacheTTL := time.Second * 5
|
dnsCacheTTL := time.Second * 5
|
||||||
lastDnsResolution := time.Unix(0, 0)
|
lastDnsResolution := time.Unix(0, 0)
|
||||||
backendIPAddreses := []net.IP{}
|
backendIPAddreses := []net.IP{}
|
||||||
lookupHostname := func(hostname string) string {
|
|
||||||
if time.Since(lastDnsResolution) > dnsCacheTTL {
|
|
||||||
var err error
|
|
||||||
backendIPAddreses, err = net.LookupIP(hostname)
|
|
||||||
if err != nil {
|
|
||||||
log.Printf("DNS resolution of hostname (%s) failed!! error: %s", hostname, err)
|
|
||||||
}
|
|
||||||
lastDnsResolution = time.Now()
|
|
||||||
}
|
|
||||||
if backendIPAddreses != nil && len(backendIPAddreses) != 0 {
|
|
||||||
return backendIPAddreses[rand.Intn(len(backendIPAddreses))].String()
|
|
||||||
}
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
|
|
||||||
reverseProxyInstance := &httputil.ReverseProxy{
|
if time.Since(lastDnsResolution) > dnsCacheTTL {
|
||||||
Director: func(request *http.Request) {
|
var err error
|
||||||
ip := lookupHostname(config.UpstreamHostname)
|
backendIPAddreses, err = net.LookupIP(hostname)
|
||||||
address := fmt.Sprintf("%s:%d", ip, config.UpstreamPort)
|
checkErrWarn(err, fmt.Sprintf("DNS resolution of hostname (%s) failed!! error: %s", hostname, err), 1)
|
||||||
request.Host = address
|
lastDnsResolution = time.Now()
|
||||||
request.URL.Scheme = "http"
|
}
|
||||||
request.URL.Host = address
|
if backendIPAddreses != nil && len(backendIPAddreses) != 0 {
|
||||||
log.Printf("connecting to %s...\n", address)
|
return backendIPAddreses[rand.Intn(len(backendIPAddreses))].String()
|
||||||
|
}
|
||||||
|
return ""
|
||||||
|
}
|
||||||
|
|
||||||
|
func reverseProxyDirectorFunction(request *http.Request) {
|
||||||
|
ip := lookupHostname(config.UpstreamHostname)
|
||||||
|
address := fmt.Sprintf("%s:%d", ip, config.UpstreamPort)
|
||||||
|
request.Host = address
|
||||||
|
request.URL.Scheme = "http"
|
||||||
|
request.URL.Host = address
|
||||||
|
verbosityLevelLogPrint(fmt.Sprintf("connecting to %s...\n", address), 1)
|
||||||
|
}
|
||||||
|
|
||||||
|
// symys to forest: I put this handler into its own named function,
|
||||||
|
// mainly because a work of art deserves a title
|
||||||
|
func whackyHackyHandlerFunc(responseWriter http.ResponseWriter, request *http.Request) {
|
||||||
|
|
||||||
|
verbosityLevelLogPrint("whackyHackyHandlerFunc called", 5)
|
||||||
|
|
||||||
|
reverseProxyInstance := httputil.ReverseProxy{
|
||||||
|
Director: reverseProxyDirectorFunction,
|
||||||
|
Transport: &ResponseCodeReportingTransport{
|
||||||
|
UnderlyingTransport: http.DefaultTransport,
|
||||||
},
|
},
|
||||||
Transport: http.DefaultTransport,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//robotsTxtDisallow := []regexp.Regexp{}
|
remoteIp := getRemoteIP(request)
|
||||||
robotsTxtDisallow := getRobotsDotTxt()
|
userAgent := request.Header.Get("User-Agent")
|
||||||
|
verbosityLevelLogPrint(fmt.Sprintf("userAgnt from whackyHackyHandlerFunc: %v", userAgent), 5)
|
||||||
http.HandleFunc("/", func(responseWriter http.ResponseWriter, request *http.Request) {
|
isBotResult := isbot.Bot(request)
|
||||||
|
if isbot.Is(isBotResult) {
|
||||||
remoteIp := getRemoteIP(request)
|
for _, r := range WhitelistUserAgentRegexes {
|
||||||
userAgent := request.Header.Get("User-Agent")
|
if r.MatchString(userAgent) {
|
||||||
isBotResult := isbot.Bot(request)
|
isBotResult = isbot.NoBotKnown
|
||||||
if isbot.Is(isBotResult) {
|
|
||||||
for _, r := range WhitelistUserAgentRegexes {
|
|
||||||
if r.MatchString(userAgent) {
|
|
||||||
isBotResult = isbot.NoBotKnown
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if isBotResult == isbot.NoBotNoMatch {
|
if isBotResult == isbot.NoBotNoMatch {
|
||||||
isBotResult = myIsBot(remoteIp, userAgent)
|
isBotResult = myIsBot(remoteIp, userAgent)
|
||||||
|
}
|
||||||
|
|
||||||
|
isBotReason := getIsBotReason(isBotResult)
|
||||||
|
|
||||||
|
botsAllowed := true
|
||||||
|
for _, r := range robotsTxtDisallow {
|
||||||
|
thisRegexpMsg := fmt.Sprintf("Checking URL pattern: %v against regexp: %v", request.URL.Path, r)
|
||||||
|
verbosityLevelLogPrint(thisRegexpMsg, 5)
|
||||||
|
if r.MatchString(request.URL.Path) {
|
||||||
|
verbosityLevelLogPrint(fmt.Sprintf("Request matched disallowed pattern: %v", r), 5)
|
||||||
|
botsAllowed = false
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
botsException := false
|
||||||
|
for _, r := range ExplicitlyAllowedPaths {
|
||||||
|
thisRegexpMsg := fmt.Sprintf("Checking URL pattern: %v against regexp: %v", request.URL.Path, r)
|
||||||
|
verbosityLevelLogPrint(thisRegexpMsg, 5)
|
||||||
|
if r.MatchString(request.URL.Path) {
|
||||||
|
verbosityLevelLogPrint(fmt.Sprintf("Request matched allowed pattern: %v", r), 5)
|
||||||
|
botsException = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
blockedTime, count, sleepTimeNeeded, allowedEverywhere := incrementTrafficDB(remoteIp, request.URL.Path, isbot.Is(isBotResult), botsAllowed)
|
||||||
|
|
||||||
|
if allowedEverywhere {
|
||||||
|
verbosityLevelLogPrint(fmt.Sprintf("%v is allowed everywhere as it was previously served a 303.", remoteIp), 5)
|
||||||
|
reverseProxyInstance.ServeHTTP(responseWriter, request)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if botsException || botsAllowed || isbot.IsNot(isBotResult) {
|
||||||
|
sleepyTime := time.Duration(sleepTimeNeeded*1000) * time.Millisecond
|
||||||
|
if sleepyTime > 0 {
|
||||||
|
verbosityLevelLogPrint(fmt.Sprintf("Need to sleep for: %v , to comply with rate limit", sleepyTime), 5)
|
||||||
|
time.Sleep(sleepyTime)
|
||||||
|
} else {
|
||||||
|
verbosityLevelLogPrint("Request looks fine, but not logged in, so subject to rate limit. It's staying within the rate limit on its own.", 4)
|
||||||
|
}
|
||||||
|
reverseProxyInstance.ServeHTTP(responseWriter, request)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
hoursInAMonth := float64(24 * 30)
|
||||||
|
blocked := time.Since(blockedTime).Hours() < hoursInAMonth
|
||||||
|
|
||||||
|
log.Printf(
|
||||||
|
"%s BLOCKED(%d):%t botsAllowed:%t isBot:%t %s %s because %s, %s",
|
||||||
|
remoteIp, int(count), blocked, botsAllowed, isbot.Is(isBotResult), request.Method, request.URL.Path, isBotReason, userAgent,
|
||||||
|
)
|
||||||
|
|
||||||
|
if blocked || strings.Trim(request.URL.Path, "/") == "bible.txt" {
|
||||||
|
tarpitLogMsg := fmt.Sprintf("Tarpit activated for %v", remoteIp)
|
||||||
|
verbosityLevelLogPrint(tarpitLogMsg, 2)
|
||||||
|
bible, err := os.Open("bible.txt")
|
||||||
|
if err != nil {
|
||||||
|
http.Error(responseWriter, `1:0 Welcome to our bad bot behaviour management tarpit.
|
||||||
|
You are here because you requested over 101 "below-surface-level" pages
|
||||||
|
in under 16 minutes. Your IP address will be blocked for 1 month.
|
||||||
|
If you wish to have it unblocked, please email ops@cyberia.club`, http.StatusTeapot)
|
||||||
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
isBotReason := getIsBotReason(isBotResult)
|
responseWriter.Header().Set("Content-Type", "text/html")
|
||||||
|
//Uncomment to give the bots a little heads-up
|
||||||
|
//responseWriter.WriteHeader(http.StatusTeapot)
|
||||||
|
|
||||||
botsAllowed := true
|
fmt.Fprint(responseWriter, `<!DOCTYPE HTML>
|
||||||
for _, r := range robotsTxtDisallow {
|
|
||||||
if r.MatchString(request.URL.Path) {
|
|
||||||
botsAllowed = false
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
blockedTime, count := incrementTrafficDB(remoteIp, request.URL.Path, isbot.Is(isBotResult), botsAllowed)
|
|
||||||
hoursInAMonth := float64(24 * 30)
|
|
||||||
blocked := time.Since(blockedTime).Hours() < hoursInAMonth
|
|
||||||
|
|
||||||
log.Printf(
|
|
||||||
"%s BLOCKED(%d):%t botsAllowed:%t isBot:%t %s %s because %s, %s",
|
|
||||||
remoteIp, int(count), blocked, botsAllowed, isbot.Is(isBotResult), request.Method, request.URL.Path, isBotReason, userAgent,
|
|
||||||
)
|
|
||||||
|
|
||||||
if blocked || strings.Trim(request.URL.Path, "/") == "bible.txt" {
|
|
||||||
bible, err := os.Open("bible.txt")
|
|
||||||
if err != nil {
|
|
||||||
http.Error(responseWriter, `1:0 Welcome to our bad bot behaviour management tarpit.
|
|
||||||
You are here because you requested over 101 "below-surface-level" pages
|
|
||||||
in under 16 minutes. Your IP address will be blocked for 1 month.
|
|
||||||
If you wish to have it unblocked, please email ops@cyberia.club`, http.StatusForbidden)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
responseWriter.Header().Set("Content-Type", "text/html")
|
|
||||||
|
|
||||||
fmt.Fprint(responseWriter, `<!DOCTYPE HTML>
|
|
||||||
<html lang="en">
|
<html lang="en">
|
||||||
<head>
|
<head>
|
||||||
<meta charset="utf-8">
|
<meta charset="utf-8">
|
||||||
|
@ -187,124 +304,174 @@ func main() {
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
`)
|
`)
|
||||||
|
|
||||||
for {
|
for {
|
||||||
bufferSize := 1 + (rand.Int() % 160)
|
bufferSize := 1 + (rand.Int() % 160)
|
||||||
buffer := make([]byte, bufferSize)
|
buffer := make([]byte, bufferSize)
|
||||||
_, err := bible.Read(buffer)
|
_, err := bible.Read(buffer)
|
||||||
if err == io.EOF {
|
if err == io.EOF {
|
||||||
return
|
return
|
||||||
}
|
|
||||||
|
|
||||||
fmt.Fprint(responseWriter, string(buffer))
|
|
||||||
|
|
||||||
f, ok := responseWriter.(http.Flusher)
|
|
||||||
if ok && f != nil {
|
|
||||||
f.Flush()
|
|
||||||
} else {
|
|
||||||
log.Println("no flusher?")
|
|
||||||
}
|
|
||||||
time.Sleep(time.Millisecond * time.Duration(1+(rand.Int()%2000)))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return
|
fmt.Fprint(responseWriter, string(buffer))
|
||||||
|
|
||||||
|
f, ok := responseWriter.(http.Flusher)
|
||||||
|
if ok && f != nil {
|
||||||
|
f.Flush()
|
||||||
|
} else {
|
||||||
|
log.Println("no flusher?")
|
||||||
|
}
|
||||||
|
time.Sleep(time.Millisecond * time.Duration(1+(rand.Int()%2000)))
|
||||||
}
|
}
|
||||||
|
|
||||||
// if blocked || strings.Trim(request.URL.Path, "/") == "bible.txt" {
|
return
|
||||||
// http.Error(responseWriter, "403 Forbidden: You look like a bot and this path is disallowed by robots.txt", http.StatusForbidden)
|
}
|
||||||
// return
|
|
||||||
// }
|
if !botsAllowed && isbot.Is(isBotResult) {
|
||||||
|
http.Error(responseWriter, "403 Forbidden: You look like a bot and this path is disallowed by robots.txt", http.StatusForbidden)
|
||||||
if config.HardBlockBasedOnRobotsTxt && !botsAllowed && isbot.Is(isBotResult) {
|
return
|
||||||
http.Error(responseWriter, "403 Forbidden: You look like a bot and this path is disallowed by robots.txt", http.StatusForbidden)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
if botsAllowed || isbot.IsNot(isBotResult) {
|
|
||||||
reverseProxyInstance.ServeHTTP(responseWriter, request)
|
|
||||||
}
|
|
||||||
|
|
||||||
})
|
|
||||||
|
|
||||||
log.Printf(
|
|
||||||
"im about to try to listen on port %d and forward to http://%s:%d!\n",
|
|
||||||
config.ListenPort, config.UpstreamHostname, config.UpstreamPort,
|
|
||||||
)
|
|
||||||
log.Fatal(http.ListenAndServe(fmt.Sprintf(":%d", config.ListenPort), nil))
|
|
||||||
}
|
|
||||||
|
|
||||||
func incrementTrafficDB(remoteIp, requestPath string, isBot, botsAllowed bool) (time.Time, int64) {
|
|
||||||
|
|
||||||
toReturn := time.Time{}
|
|
||||||
toReturn2 := int64(0)
|
|
||||||
err := db.Update(func(tx *bolt.Tx) error {
|
|
||||||
bucket, err := tx.CreateBucketIfNotExists([]byte("ip"))
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
ipPathBucket, err := tx.CreateBucketIfNotExists([]byte("ip_path"))
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
ipKey := []byte(fmt.Sprintf("%s", remoteIp))
|
|
||||||
ipBytes := bucket.Get(ipKey)
|
|
||||||
ip := RequestsLog{}
|
|
||||||
if ipBytes != nil {
|
|
||||||
json.Unmarshal(ipBytes, &ip)
|
|
||||||
}
|
|
||||||
|
|
||||||
ipPathKey := []byte(fmt.Sprintf("%s_%s", remoteIp, requestPath))
|
|
||||||
ipPathBytes := ipPathBucket.Get(ipPathKey)
|
|
||||||
totalVisits := uint64(0)
|
|
||||||
if ipPathBytes != nil {
|
|
||||||
totalVisits = binary.LittleEndian.Uint64(ipPathBytes)
|
|
||||||
}
|
|
||||||
|
|
||||||
totalVisits += 1
|
|
||||||
secondsSinceLastRequest := time.Since(ip.TimeOfLastRequest).Seconds()
|
|
||||||
ip.TimeOfLastRequest = time.Now()
|
|
||||||
if secondsSinceLastRequest > ip.RollingAvgRequestRate {
|
|
||||||
ip.RollingAvgRequestRate = 0
|
|
||||||
ip.RecentRobotsTxtViolations = 0
|
|
||||||
} else {
|
|
||||||
ip.RollingAvgRequestRate -= secondsSinceLastRequest
|
|
||||||
}
|
|
||||||
ip.TotalRequests += 1
|
|
||||||
if botsAllowed == false {
|
|
||||||
ip.RecentRobotsTxtViolations += 1
|
|
||||||
}
|
|
||||||
ip.RollingAvgRequestRate += float64(1)
|
|
||||||
if ip.RecentRobotsTxtViolations > int64(config.TarpitAfterRecentRobotsTxtViolations) && ip.RollingAvgRequestRate > config.TarpitAfterRobotsTxtViolationsPerSecond {
|
|
||||||
ip.Blocked = time.Now()
|
|
||||||
}
|
|
||||||
toReturn = ip.Blocked
|
|
||||||
toReturn2 = ip.RecentRobotsTxtViolations
|
|
||||||
|
|
||||||
buffer := make([]byte, 8)
|
|
||||||
binary.LittleEndian.PutUint64(buffer, totalVisits)
|
|
||||||
|
|
||||||
newIpBytes, err := json.Marshal(ip)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
err = bucket.Put(ipKey, newIpBytes)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
return ipPathBucket.Put(ipPathKey, buffer)
|
|
||||||
})
|
|
||||||
if err != nil {
|
|
||||||
log.Printf("boltdb error: %v\n", err)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return toReturn, toReturn2
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func getRobotsDotTxt() []*regexp.Regexp {
|
// LogUpdate carries the inputs and the results of one traffic-log update.
// Its UpdateTheLog method runs inside a bolt transaction: it reads the first
// four fields and writes the four *Return fields.
type LogUpdate struct {
	// ForIP is the key of the per-IP record and the prefix of the per-path key.
	ForIP string
	// AllowSignal, when true, marks the IP's record as allowed everywhere.
	AllowSignal bool
	// RequestPath is joined with ForIP to key the per-path visit counter.
	RequestPath string
	// BotsAllowed is false when the request counts as a robots.txt violation.
	BotsAllowed bool

	// SleepTimeNeededReturn is the delay reported back to the caller.
	SleepTimeNeededReturn float64
	// BlockedReturn is the time stored as the IP's block timestamp.
	BlockedReturn time.Time
	// RecentRobotsViolationsReturn is the IP's recent robots.txt violation count.
	RecentRobotsViolationsReturn int64
	// AllowedEverywhereReturn reports whether the IP is allowed everywhere.
	AllowedEverywhereReturn bool
}
|
||||||
|
|
||||||
|
func (logUpdateIn *LogUpdate) UpdateTheLog(tx *bolt.Tx) error {
|
||||||
|
verbosityLevelLogPrint("calling db.Update", 5)
|
||||||
|
bucket, err := tx.CreateBucketIfNotExists([]byte("ip"))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
ipPathBucket, err := tx.CreateBucketIfNotExists([]byte("ip_path"))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
ipKey := []byte(fmt.Sprintf("%s", logUpdateIn.ForIP))
|
||||||
|
ipBytes := bucket.Get(ipKey)
|
||||||
|
ip := RequestsLog{}
|
||||||
|
if ipBytes != nil {
|
||||||
|
json.Unmarshal(ipBytes, &ip)
|
||||||
|
}
|
||||||
|
|
||||||
|
ipPathKey := []byte(fmt.Sprintf("%s_%s", logUpdateIn.ForIP, logUpdateIn.RequestPath))
|
||||||
|
ipPathBytes := ipPathBucket.Get(ipPathKey)
|
||||||
|
totalVisits := uint64(0)
|
||||||
|
if ipPathBytes != nil {
|
||||||
|
totalVisits = binary.LittleEndian.Uint64(ipPathBytes)
|
||||||
|
}
|
||||||
|
|
||||||
|
totalVisits += 1
|
||||||
|
secondsSinceLastRequest := time.Since(ip.TimeOfLastRequest).Seconds()
|
||||||
|
ip.TimeOfLastRequest = time.Now()
|
||||||
|
if secondsSinceLastRequest > ip.RollingAvgRequestRate {
|
||||||
|
ip.TimeOfFirstRequest = time.Now()
|
||||||
|
ip.NumAveragedRequests = 1
|
||||||
|
ip.RollingAvgRequestRate = 0
|
||||||
|
ip.RecentRobotsTxtViolations = 0
|
||||||
|
} else {
|
||||||
|
ip.NumAveragedRequests += 1
|
||||||
|
ip.RollingAvgRequestRate = float64(ip.NumAveragedRequests) / (time.Since(ip.TimeOfFirstRequest).Seconds())
|
||||||
|
}
|
||||||
|
logUpdateIn.SleepTimeNeededReturn = 0 //0.01*float64(runtime.NumGoroutine())*float64(ip.NumAveragedRequests)/GLOBAL_RATE_LIMIT - time.Since(ip.TimeOfFirstRequest).Seconds()
|
||||||
|
|
||||||
|
ip.TotalRequests += 1
|
||||||
|
if logUpdateIn.BotsAllowed == false {
|
||||||
|
ip.RecentRobotsTxtViolations += 1
|
||||||
|
}
|
||||||
|
|
||||||
|
ip.RollingAvgRequestRate += float64(1)
|
||||||
|
if ip.RecentRobotsTxtViolations > int64(config.TarpitAfterRecentRobotsTxtViolations) && ip.RollingAvgRequestRate > config.TarpitAfterRobotsTxtViolationsPerSecond {
|
||||||
|
ip.Blocked = time.Now()
|
||||||
|
}
|
||||||
|
rollingAvgMsg := fmt.Sprintf("ip.RollingAvgRequestRate within incrementTrafficDB: %v", ip.RollingAvgRequestRate)
|
||||||
|
verbosityLevelLogPrint(rollingAvgMsg, 5)
|
||||||
|
robotsViolationsMsg := fmt.Sprintf("ip.RecentRobotsTxtViolations within incrementTrafficDB: %v", ip.RecentRobotsTxtViolations)
|
||||||
|
verbosityLevelLogPrint(robotsViolationsMsg, 5)
|
||||||
|
logUpdateIn.BlockedReturn = ip.Blocked
|
||||||
|
logUpdateIn.RecentRobotsViolationsReturn = ip.RecentRobotsTxtViolations
|
||||||
|
if logUpdateIn.AllowSignal {
|
||||||
|
ip.AllowEverywhere = true
|
||||||
|
}
|
||||||
|
logUpdateIn.AllowedEverywhereReturn = ip.AllowEverywhere
|
||||||
|
|
||||||
|
buffer := make([]byte, 8)
|
||||||
|
binary.LittleEndian.PutUint64(buffer, totalVisits)
|
||||||
|
|
||||||
|
newIpBytes, err := json.Marshal(ip)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
err = bucket.Put(ipKey, newIpBytes)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
return ipPathBucket.Put(ipPathKey, buffer)
|
||||||
|
}
|
||||||
|
|
||||||
|
func incrementTrafficDB(remoteIp, requestPath string, isBot, botsAllowed bool) (blockedReturn time.Time, recentRobotsViolationsReturn int64, sleepTimeNeededReturn float64, allowedEverywhereReturn bool) {
|
||||||
|
|
||||||
|
recentRobotsViolationsReturn = int64(0)
|
||||||
|
preErrMsg := fmt.Sprintf("incrementTrafficDB called with params (%v, %v, %v, %v)", remoteIp, requestPath, isBot, botsAllowed)
|
||||||
|
verbosityLevelLogPrint(preErrMsg, 5)
|
||||||
|
|
||||||
|
select {
|
||||||
|
case ipSignaledAllow := <-AllowIPChan:
|
||||||
|
verbosityLevelLogPrint("Consumed AllowIPChan value", 5)
|
||||||
|
allowedLogUpdate := LogUpdate{
|
||||||
|
ForIP: ipSignaledAllow,
|
||||||
|
AllowSignal: true,
|
||||||
|
RequestPath: "/",
|
||||||
|
BotsAllowed: true,
|
||||||
|
}
|
||||||
|
allowedLogMsg := fmt.Sprintf("allowedLogUpdate in ipSignaledAllow case, before update: %v", allowedLogUpdate)
|
||||||
|
verbosityLevelLogPrint(allowedLogMsg, 5)
|
||||||
|
err := db.Update(allowedLogUpdate.UpdateTheLog)
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("boltdb error: %v\n", err)
|
||||||
|
}
|
||||||
|
postUpdateMsg := fmt.Sprintf("allowedLogUpdate in ipSignaledAllow case, after update: %v", allowedLogUpdate)
|
||||||
|
verbosityLevelLogPrint(postUpdateMsg, 5)
|
||||||
|
default:
|
||||||
|
verbosityLevelLogPrint("Default case: no AllowIPChan value to consume", 5)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
thisLogUpdate := LogUpdate{
|
||||||
|
ForIP: remoteIp,
|
||||||
|
AllowSignal: false,
|
||||||
|
RequestPath: requestPath,
|
||||||
|
BotsAllowed: botsAllowed,
|
||||||
|
}
|
||||||
|
err := db.Update(thisLogUpdate.UpdateTheLog)
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("boltdb error: %v\n", err)
|
||||||
|
} else {
|
||||||
|
|
||||||
|
sleepTimeNeededReturn = thisLogUpdate.SleepTimeNeededReturn
|
||||||
|
blockedReturn = thisLogUpdate.BlockedReturn
|
||||||
|
recentRobotsViolationsReturn = thisLogUpdate.RecentRobotsViolationsReturn
|
||||||
|
allowedEverywhereReturn = thisLogUpdate.AllowedEverywhereReturn
|
||||||
|
}
|
||||||
|
incrementTrafficReturnDebugMsg := fmt.Sprintf("allowedEveryWhereReturn just before returning incrementTrafficDB: %v", allowedEverywhereReturn)
|
||||||
|
verbosityLevelLogPrint(incrementTrafficReturnDebugMsg, 5)
|
||||||
|
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
func getRobotsDotTxt() (robotsDisallowed []*regexp.Regexp) {
|
||||||
robotsTxtResponse, err := http.Get(fmt.Sprintf("http://%s:%d/robots.txt", config.UpstreamHostname, config.UpstreamPort))
|
robotsTxtResponse, err := http.Get(fmt.Sprintf("http://%s:%d/robots.txt", config.UpstreamHostname, config.UpstreamPort))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatalf(
|
log.Fatalf(
|
||||||
|
@ -328,7 +495,7 @@ func getRobotsDotTxt() []*regexp.Regexp {
|
||||||
txtStr := string(txtBytes)
|
txtStr := string(txtBytes)
|
||||||
txtLines := strings.Split(txtStr, "\n")
|
txtLines := strings.Split(txtStr, "\n")
|
||||||
|
|
||||||
robotsTxtDisallow := []*regexp.Regexp{}
|
robotsDisallowed = []*regexp.Regexp{}
|
||||||
for _, line := range txtLines {
|
for _, line := range txtLines {
|
||||||
line = strings.ToLower(line)
|
line = strings.ToLower(line)
|
||||||
if strings.HasPrefix(line, "disallow:") {
|
if strings.HasPrefix(line, "disallow:") {
|
||||||
|
@ -337,17 +504,20 @@ func getRobotsDotTxt() []*regexp.Regexp {
|
||||||
expression = strings.ReplaceAll(expression, "*", ".*")
|
expression = strings.ReplaceAll(expression, "*", ".*")
|
||||||
expression = "^" + expression + ".*"
|
expression = "^" + expression + ".*"
|
||||||
|
|
||||||
log.Printf("robotsTxtDisallow: %s\n", expression)
|
log.Printf("robotsDisallowed: %s\n", expression)
|
||||||
robotsTxtDisallow = append(robotsTxtDisallow, regexp.MustCompile(expression))
|
robotsDisallowed = append(robotsDisallowed, regexp.MustCompile(expression))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return robotsTxtDisallow
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// getRemoteIP returns the client address for request.
//
// A non-empty X-Forwarded-For header is returned verbatim. Otherwise the
// address is taken from request.RemoteAddr with the trailing ":port" removed
// and the square brackets around an IPv6 host stripped, so "[::1]:8080"
// yields "::1". A RemoteAddr without any colon is returned unchanged instead
// of causing a slice-bounds panic.
func getRemoteIP(request *http.Request) string {
	if forwarded := request.Header.Get("X-Forwarded-For"); forwarded != "" {
		return forwarded
	}

	host := request.RemoteAddr
	// LastIndex rather than Split, so the colons inside an IPv6 host survive.
	// An address ending in "]" has no port, so there is nothing to cut.
	if portColon := strings.LastIndex(host, ":"); portColon >= 0 && !strings.HasSuffix(host, "]") {
		host = host[:portColon]
	}
	return strings.TrimSuffix(strings.TrimPrefix(host, "["), "]")
}
|
||||||
|
@ -356,6 +526,7 @@ func myIsBot(remoteIP string, userAgent string) isbot.Result {
|
||||||
|
|
||||||
for i, prefix := range config.BlockedSourceIpPrefixes {
|
for i, prefix := range config.BlockedSourceIpPrefixes {
|
||||||
if strings.HasPrefix(remoteIP, prefix.Prefix) {
|
if strings.HasPrefix(remoteIP, prefix.Prefix) {
|
||||||
|
verbosityLevelLogPrint("Judged to be bot in myIsBot based on BlockedSourceIpPrefixes", 5)
|
||||||
return isbot.Result(MyBlockedSourceIpPrefixIsBotReasonOffset + i)
|
return isbot.Result(MyBlockedSourceIpPrefixIsBotReasonOffset + i)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -363,6 +534,7 @@ func myIsBot(remoteIP string, userAgent string) isbot.Result {
|
||||||
|
|
||||||
for _, r := range BlockedUserAgentRegexes {
|
for _, r := range BlockedUserAgentRegexes {
|
||||||
if r.MatchString(userAgent) {
|
if r.MatchString(userAgent) {
|
||||||
|
verbosityLevelLogPrint("Judged to be bot based in myIsBot based on BlockedUserAgentRegexes", 5)
|
||||||
return isbot.BotClientLibrary
|
return isbot.BotClientLibrary
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -371,9 +543,11 @@ func myIsBot(remoteIP string, userAgent string) isbot.Result {
|
||||||
|
|
||||||
func getIsBotReason(code isbot.Result) string {
|
func getIsBotReason(code isbot.Result) string {
|
||||||
|
|
||||||
|
thisCodeAsUint8 := uint8(code)
|
||||||
|
|
||||||
myReasonMap := map[uint8]string{
|
myReasonMap := map[uint8]string{
|
||||||
0: "Known to not be a bot",
|
0: "Known to not be a bot",
|
||||||
1: "None of the rules matches, so probably not a bot",
|
1: "None of the rules matches, so we assume non-bot",
|
||||||
2: "Prefetch algorithm",
|
2: "Prefetch algorithm",
|
||||||
3: "User-Agent appeared to contain a URL",
|
3: "User-Agent appeared to contain a URL",
|
||||||
4: "Known client library",
|
4: "Known client library",
|
||||||
|
@ -395,5 +569,22 @@ func getIsBotReason(code isbot.Result) string {
|
||||||
myReasonMap[uint8(MyBlockedSourceIpPrefixIsBotReasonOffset+i)] = prefix.ShortDesc
|
myReasonMap[uint8(MyBlockedSourceIpPrefixIsBotReasonOffset+i)] = prefix.ShortDesc
|
||||||
}
|
}
|
||||||
|
|
||||||
return myReasonMap[uint8(code)]
|
verbosityLevelLogPrint(fmt.Sprintf("reason fount by getIsBotReason: %v", myReasonMap[thisCodeAsUint8]), 5)
|
||||||
|
return myReasonMap[thisCodeAsUint8]
|
||||||
|
}
|
||||||
|
|
||||||
|
// A couple of helper functions, mainly for debugging
|
||||||
|
func checkErrWarn(err error, msg string, verbosityLevel int) {
|
||||||
|
if err != nil {
|
||||||
|
if verbosityLevel <= GLOBAL_VERBOSITY {
|
||||||
|
log.Printf("%s: %v\n", msg, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func verbosityLevelLogPrint(msg string, verbosityLevel int) {
|
||||||
|
if verbosityLevel <= GLOBAL_VERBOSITY {
|
||||||
|
log.Printf("%s\n", msg)
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
43
mainDiagram.mermaid
Normal file
43
mainDiagram.mermaid
Normal file
|
@ -0,0 +1,43 @@
|
||||||
|
stateDiagram-v2
|
||||||
|
direction TB
|
||||||
|
|
||||||
|
accTitle: crawler-blocker Diagram
|
||||||
|
accDescr: Diagram of main.go
|
||||||
|
|
||||||
|
|
||||||
|
classDef systemShell fill:#00f1,font-weight:bold,stroke-width:1px,stroke:black
|
||||||
|
classDef subroutine fill: #dff0fe,stroke-width:1px,stroke:black
|
||||||
|
classDef nixShell fill:#00f1,font-weight:bold,stroke-width:1px,stroke:black
|
||||||
|
classDef terminal fill:black,color:#d7e0ff,font-weight:bold,stroke-width:1px,stroke:black
|
||||||
|
|
||||||
|
class nixShellEnvironment nixShell
|
||||||
|
class setConfigSubroutine, nixShellExitSubroutine subroutine
|
||||||
|
class mainFunction systemShell
|
||||||
|
class preEntryState, setConfigSubroutine, coswayCommand, postExitState terminal
|
||||||
|
|
||||||
|
preEntryState --> mainFunction
|
||||||
|
setConfigSubroutine --> dbStatSubroutine
|
||||||
|
callToConfigSelf --> handleFuncCall
|
||||||
|
mainFunction --> postExitState
|
||||||
|
|
||||||
|
state "Define types. For convenience and clarity." as preEntryState
|
||||||
|
state "Define global variables and constants. These make it easier to pass data between some of the other functions below." as preEntryState
|
||||||
|
state "entrypoint to main" as preEntryState
|
||||||
|
state "main exits and control is returned to the calling environment" as postExitState
|
||||||
|
state "main" as mainFunction {
|
||||||
|
|
||||||
|
state "configureSelf() is called" as callToConfigSelf {
|
||||||
|
|
||||||
|
state "Configuration Subroutine" as setConfigSubroutine
|
||||||
|
state "Import config from file, panic if error occurs" as setConfigSubroutine
|
||||||
|
state "Check that config parameters are within acceptable values. Exit if not." as setConfigSubroutine
|
||||||
|
|
||||||
|
state "DataBase Stat and Initialization Subroutine" as dbStatSubroutine
|
||||||
|
state "stat ./data directory. Create it if it doesn't exist." as dbStatSubroutine
|
||||||
|
}
|
||||||
|
state "call to http.HandlerFunc" as handleFuncCall
|
||||||
|
state "Register whackyHackyHandlerFunc with pattern / to the DefaultServeMux." as handleFuncCall
|
||||||
|
state "The / pattern basically matches everything from the root of the URL path." as handleFuncCall
|
||||||
|
state "So here, all incoming requests should be handled by this function." as handleFuncCall
|
||||||
|
|
||||||
|
}
|
BIN
pkgnamezzz
Executable file
BIN
pkgnamezzz
Executable file
Binary file not shown.
214
pkgnamezzz_test.go
Normal file
214
pkgnamezzz_test.go
Normal file
|
@ -0,0 +1,214 @@
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"log"
|
||||||
|
"net/http"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// EditableUserAgentTransport is an http.RoundTripper for tests that sends a
// chosen User-Agent header with every request it forwards.
type EditableUserAgentTransport struct {
	// CustomUserAgent is the value written to the User-Agent header.
	CustomUserAgent string
	// UnderlyingTransport performs the actual round trip.
	UnderlyingTransport http.RoundTripper
}
|
||||||
|
|
||||||
|
func NewEditableUserAgentTransport(desiredUserAgent string) EditableUserAgentTransport {
|
||||||
|
transportOut := EditableUserAgentTransport{
|
||||||
|
CustomUserAgent: desiredUserAgent,
|
||||||
|
UnderlyingTransport: http.DefaultTransport,
|
||||||
|
}
|
||||||
|
return transportOut
|
||||||
|
}
|
||||||
|
|
||||||
|
func (t *EditableUserAgentTransport) RoundTrip(request *http.Request) (*http.Response, error) {
|
||||||
|
verbosityLevelLogPrint(fmt.Sprintf("roundtripping %s...", request.URL), 5)
|
||||||
|
request.Header["User-Agent"] = []string{t.CustomUserAgent}
|
||||||
|
response, err := t.UnderlyingTransport.RoundTrip(request)
|
||||||
|
if err != nil {
|
||||||
|
log.Printf("contacting %s (%s) failed: %s", request.RequestURI, request.URL.String(), err)
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return response, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
func TestGetRobotsDotTxt(t *testing.T) {
|
||||||
|
fmt.Println("Running test TestGetRobotsDotTxt")
|
||||||
|
//It's assumed configureSelf() will take longer than setting up
|
||||||
|
//the server in runSimpleServer(), so no additional logic has been
|
||||||
|
//added to ensure the server is running before attempting to use it
|
||||||
|
//Should be fine, as it errs on the side of failure anyway.
|
||||||
|
go runSimpleServer(":3000", "TestGetRobotsDotTxt")
|
||||||
|
configureSelf()
|
||||||
|
|
||||||
|
thisRobots := getRobotsDotTxt()
|
||||||
|
robotsMsg := fmt.Sprintf("robots.txt: %v\n", thisRobots)
|
||||||
|
log.Printf(robotsMsg)
|
||||||
|
thisShouldMatch := "http://localhost:3002/respond200"
|
||||||
|
thisShouldNotMatch := "https://localhost:3002/respond303"
|
||||||
|
|
||||||
|
doesActuallyMatch := false
|
||||||
|
shouldNotMatchButDoes := false
|
||||||
|
for _, r := range thisRobots {
|
||||||
|
if r.MatchString(thisShouldMatch) {
|
||||||
|
doesActuallyMatch = true
|
||||||
|
}
|
||||||
|
if r.MatchString(thisShouldNotMatch) {
|
||||||
|
shouldNotMatchButDoes = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !doesActuallyMatch {
|
||||||
|
t.Errorf("Pattern %v should match robots regexp but does not.", thisShouldMatch)
|
||||||
|
}
|
||||||
|
if shouldNotMatchButDoes {
|
||||||
|
t.Errorf("Pattern %v should NOT match robots regexp, but it does.", thisShouldNotMatch)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
// ReqBoxForTest describes one GET request a test sends and the status code it
// expects back.
type ReqBoxForTest struct {
	// TestURL is the URL to fetch.
	TestURL string
	// StatusWant is the HTTP status code the response must carry.
	StatusWant int
	// CustomClient is the client used to send the request.
	CustomClient http.Client
	// SleepTime is how long to pause after the request has been checked.
	SleepTime time.Duration
}
|
||||||
|
|
||||||
|
func (rb ReqBoxForTest) DoRequestStuff(t *testing.T) (measuredFetchTime time.Duration) {
|
||||||
|
|
||||||
|
testURL := rb.TestURL
|
||||||
|
testURLMsg := fmt.Sprintf("testURL: %v", testURL)
|
||||||
|
verbosityLevelLogPrint(testURLMsg, 5)
|
||||||
|
thisClient := rb.CustomClient
|
||||||
|
timeBeforeGet := time.Now()
|
||||||
|
thisResponse, err := thisClient.Get(testURL)
|
||||||
|
measuredFetchTime = time.Since(timeBeforeGet)
|
||||||
|
statusWant := rb.StatusWant
|
||||||
|
statusGot := thisResponse.StatusCode
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("Error getting %s: %v", testURL, err)
|
||||||
|
} else if statusWant != statusGot {
|
||||||
|
t.Errorf("Test should fail: Should have gotten status %v but got %v!", statusWant, statusGot)
|
||||||
|
}
|
||||||
|
fmt.Printf("thisResponse: %v\n", thisResponse)
|
||||||
|
time.Sleep(rb.SleepTime)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestMain(t *testing.T) {
|
||||||
|
GLOBAL_VERBOSITY = 5
|
||||||
|
fmt.Println("Running test TestMain")
|
||||||
|
go runSimpleServer(":3000", "TestMain")
|
||||||
|
|
||||||
|
commonRealUserAgent := "Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0"
|
||||||
|
thisTransport := NewEditableUserAgentTransport(commonRealUserAgent)
|
||||||
|
thisClient := http.Client{Transport: &thisTransport}
|
||||||
|
go main()
|
||||||
|
|
||||||
|
//Sleep while main sets up the reverse proxy
|
||||||
|
//Inelegant, but errs on the side of failure
|
||||||
|
//so its okay
|
||||||
|
time.Sleep(1 * time.Second)
|
||||||
|
|
||||||
|
testCaseNormalUserLike := ReqBoxForTest{
|
||||||
|
TestURL: "http://localhost:3002/respond200",
|
||||||
|
StatusWant: 200,
|
||||||
|
CustomClient: thisClient,
|
||||||
|
SleepTime: 0 * time.Second,
|
||||||
|
}
|
||||||
|
|
||||||
|
testCaseNormalUserLike.DoRequestStuff(t)
|
||||||
|
|
||||||
|
testCaseMaliciousUser := ReqBoxForTest{
|
||||||
|
TestURL: "http://localhost:3002/respond200",
|
||||||
|
StatusWant: 200,
|
||||||
|
CustomClient: thisClient,
|
||||||
|
SleepTime: 0 * time.Second,
|
||||||
|
}
|
||||||
|
|
||||||
|
numRequestsDone := 0
|
||||||
|
testDuration := time.Duration(0)
|
||||||
|
for i := 0; i < 20; i++ {
|
||||||
|
testDuration += testCaseMaliciousUser.DoRequestStuff(t)
|
||||||
|
durationMsg := fmt.Sprintf("test iteration %v had duration %v", i, testDuration)
|
||||||
|
verbosityLevelLogPrint(durationMsg, 5)
|
||||||
|
numRequestsDone++
|
||||||
|
}
|
||||||
|
|
||||||
|
testRate := float64(numRequestsDone) / testDuration.Seconds()
|
||||||
|
totalDurationMsg := fmt.Sprintf("Rate limit test completed %v test requests in %v seconds", numRequestsDone, testDuration.Seconds())
|
||||||
|
verbosityLevelLogPrint(totalDurationMsg, 5)
|
||||||
|
if testRate > GLOBAL_RATE_LIMIT+1 {
|
||||||
|
t.Errorf("MaliciousUser test should fail: reverse proxy failed to enforce global rate limit of %v. Tested average request rate was: %v", GLOBAL_RATE_LIMIT, testRate)
|
||||||
|
}
|
||||||
|
|
||||||
|
sillyFakeUserAgent := "I am 3vil and undetectable 8D"
|
||||||
|
thisTransport = NewEditableUserAgentTransport(sillyFakeUserAgent)
|
||||||
|
thisClient = http.Client{Transport: &thisTransport}
|
||||||
|
|
||||||
|
maxNumRobotsViolations := config.TarpitAfterRecentRobotsTxtViolations
|
||||||
|
desiredRateLimit := config.TarpitAfterRobotsTxtViolationsPerSecond
|
||||||
|
neededSleepTime := 1000.0 / (desiredRateLimit + 1.0)
|
||||||
|
rateLimitMsg := fmt.Sprintf("desiredRateLimit: %v, so sleep time should be %v", desiredRateLimit, neededSleepTime)
|
||||||
|
verbosityLevelLogPrint(rateLimitMsg, 5)
|
||||||
|
|
||||||
|
testCaseDumbBotLike := ReqBoxForTest{
|
||||||
|
TestURL: "http://localhost:3002/respond200",
|
||||||
|
StatusWant: 403,
|
||||||
|
CustomClient: thisClient,
|
||||||
|
SleepTime: time.Duration(neededSleepTime) * time.Millisecond,
|
||||||
|
}
|
||||||
|
for reqIndex := 0; reqIndex < maxNumRobotsViolations-21; reqIndex++ {
|
||||||
|
testCaseDumbBotLike.DoRequestStuff(t)
|
||||||
|
}
|
||||||
|
|
||||||
|
testCaseDumbBotPassesThreshold := ReqBoxForTest{
|
||||||
|
TestURL: "http://localhost:3002/respond200",
|
||||||
|
StatusWant: 200,
|
||||||
|
CustomClient: thisClient,
|
||||||
|
SleepTime: time.Duration(neededSleepTime) * time.Millisecond,
|
||||||
|
}
|
||||||
|
testCaseDumbBotPassesThreshold.DoRequestStuff(t)
|
||||||
|
|
||||||
|
testCasePushRateLimit := ReqBoxForTest{
|
||||||
|
TestURL: "http://localhost:3002/respond200robotsallowed",
|
||||||
|
StatusWant: 200,
|
||||||
|
CustomClient: thisClient,
|
||||||
|
SleepTime: 0 * time.Second,
|
||||||
|
}
|
||||||
|
|
||||||
|
numRequestsDone = 0
|
||||||
|
testDuration = time.Duration(0)
|
||||||
|
for i := 0; i < 20; i++ {
|
||||||
|
testDuration += testCasePushRateLimit.DoRequestStuff(t)
|
||||||
|
durationMsg := fmt.Sprintf("test iteration %v had duration %v", i, testDuration)
|
||||||
|
verbosityLevelLogPrint(durationMsg, 5)
|
||||||
|
numRequestsDone++
|
||||||
|
}
|
||||||
|
|
||||||
|
testRate = float64(numRequestsDone) / testDuration.Seconds()
|
||||||
|
totalDurationMsg = fmt.Sprintf("Rate limit test completed %v test requests in %v seconds", numRequestsDone, testDuration.Seconds())
|
||||||
|
verbosityLevelLogPrint(totalDurationMsg, 5)
|
||||||
|
if testRate > GLOBAL_RATE_LIMIT+1 {
|
||||||
|
t.Errorf("Test should fail: reverse proxy failed to enforce global rate limit of %v. Tested average request rate was: %v", GLOBAL_RATE_LIMIT, testRate)
|
||||||
|
}
|
||||||
|
|
||||||
|
testCaseUserLoggingIn := ReqBoxForTest{
|
||||||
|
TestURL: "http://localhost:3002/respond303",
|
||||||
|
StatusWant: 303,
|
||||||
|
CustomClient: thisClient,
|
||||||
|
SleepTime: 0 * time.Second,
|
||||||
|
}
|
||||||
|
testCaseUserLoggingIn.DoRequestStuff(t)
|
||||||
|
|
||||||
|
testCaseUserAfterLoggedIn := ReqBoxForTest{
|
||||||
|
TestURL: "http://localhost:3002/respond200",
|
||||||
|
StatusWant: 200,
|
||||||
|
CustomClient: thisClient,
|
||||||
|
SleepTime: 0 * time.Second,
|
||||||
|
}
|
||||||
|
|
||||||
|
testCaseUserAfterLoggedIn.DoRequestStuff(t)
|
||||||
|
|
||||||
|
}
|
16
rolling_avg_calculation.txt
Normal file
16
rolling_avg_calculation.txt
Normal file
|
@ -0,0 +1,16 @@
|
||||||
|
requestRate = requests/second
|
||||||
|
avgRequestRate = numberOfRequests/TotalTimeLoggingRequests
|
||||||
|
|
||||||
|
to be within limit, need:
|
||||||
|
avgRequestRate <= globalLimit
|
||||||
|
or:
|
||||||
|
numRequests/TotalTime <= globalLimit
|
||||||
|
numRequests/globalLimit <= TotalTime
|
||||||
|
|
||||||
|
We can control it with sleep time, so
|
||||||
|
|
||||||
|
numRequests/globalLimit <= TotalTime + SleepTime
|
||||||
|
so
|
||||||
|
numRequests/globalLimit - TotalTime <= SleepTime
|
||||||
|
|
||||||
|
We'll go with equal here, but we could also do a randrange with the left hand side as the minimum.
|
70
simpleServer.go
Normal file
70
simpleServer.go
Normal file
|
@ -0,0 +1,70 @@
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"io/ioutil"
|
||||||
|
"log"
|
||||||
|
"net/http"
|
||||||
|
)
|
||||||
|
|
||||||
|
// rootHandler answers any path that has no more specific handler with a 200
// status and a fixed explanatory body.
func rootHandler(w http.ResponseWriter, r *http.Request) {
	data := "Any page other than /respond303 or /respond200 should end up here"

	// Headers changed after WriteHeader are not sent, so set all of them
	// first.
	w.Header().Set("Content-Type", "text/html")
	w.Header().Set("Content-Length", fmt.Sprint(len(data)))
	w.WriteHeader(http.StatusOK)
	fmt.Fprint(w, data)
}
|
||||||
|
|
||||||
|
// robotsHandler serves the contents of testRobots.txt, read from the current
// working directory on every call, with a 200 status.
//
// The file is read before anything is written, so a read failure is answered
// with a 500 instead of a panic after a 200 status has already been chosen.
func robotsHandler(w http.ResponseWriter, r *http.Request) {
	fmt.Printf("robotsHandler called\n")

	data, err := ioutil.ReadFile("testRobots.txt")
	if err != nil {
		http.Error(w, fmt.Sprintf("reading testRobots.txt: %v", err), http.StatusInternalServerError)
		return
	}

	// Headers changed after WriteHeader are not sent, so set all of them
	// first.
	w.Header().Set("Content-Type", "text/html")
	w.Header().Set("Content-Length", fmt.Sprint(len(data)))
	w.WriteHeader(http.StatusOK)
	if _, err := w.Write(data); err != nil {
		log.Printf("writing robots.txt response: %v", err)
	}
}
|
||||||
|
|
||||||
|
// respond303Handler always answers with status 303 and a short fixed body.
// No Location header is set.
func respond303Handler(w http.ResponseWriter, r *http.Request) {
	data := "You should get a 303 status!"

	// Headers changed after WriteHeader are not sent, so set all of them
	// first.
	w.Header().Set("Content-Type", "text/html")
	w.Header().Set("Content-Length", fmt.Sprint(len(data)))
	w.WriteHeader(http.StatusSeeOther)
	fmt.Fprint(w, data)
}
|
||||||
|
|
||||||
|
// respond200Handler always answers with status 200 and a short fixed body.
func respond200Handler(w http.ResponseWriter, r *http.Request) {
	data := "You should get a 200 status!"

	// Headers changed after WriteHeader are not sent, so set all of them
	// first.
	w.Header().Set("Content-Type", "text/html")
	w.Header().Set("Content-Length", fmt.Sprint(len(data)))
	w.WriteHeader(http.StatusOK)
	fmt.Fprint(w, data)
}
|
||||||
|
|
||||||
|
type customHandlerToAvoidDefaultServeMux struct {
|
||||||
|
NondefaultServeMux *http.ServeMux
|
||||||
|
}
|
||||||
|
|
||||||
|
func (customHandler customHandlerToAvoidDefaultServeMux) ServeHTTP(rw http.ResponseWriter, req *http.Request) {
|
||||||
|
verbosityLevelLogPrint(fmt.Sprintf("call to customHandlerToAvoidDefaultServeMux with request URL: %v", req.URL), 5)
|
||||||
|
customHandler.NondefaultServeMux.ServeHTTP(rw, req)
|
||||||
|
}
|
||||||
|
|
||||||
|
func runSimpleServer(portIn string, callingFunc string) {
|
||||||
|
GLOBAL_VERBOSITY = 5
|
||||||
|
//Need to define a new ServeMux to disambiguate from the main() server
|
||||||
|
testServeMux := http.NewServeMux()
|
||||||
|
testHandler := customHandlerToAvoidDefaultServeMux{NondefaultServeMux: testServeMux}
|
||||||
|
|
||||||
|
testHandler.NondefaultServeMux.HandleFunc("/", rootHandler)
|
||||||
|
testHandler.NondefaultServeMux.HandleFunc("/respond303", respond303Handler)
|
||||||
|
testHandler.NondefaultServeMux.HandleFunc("/respond200", respond200Handler)
|
||||||
|
testHandler.NondefaultServeMux.HandleFunc("/respond200RobotsAllowed", respond200Handler)
|
||||||
|
testHandler.NondefaultServeMux.HandleFunc("/robots.txt", robotsHandler)
|
||||||
|
verbosityLevelLogPrint(fmt.Sprintf("runSimpleServer called by %v\n", callingFunc), 5)
|
||||||
|
verbosityLevelLogPrint(fmt.Sprintf("About to ListenAndServe with params ( %v , %v )", portIn, testHandler), 5)
|
||||||
|
log.Fatal(http.ListenAndServe(portIn, testHandler))
|
||||||
|
|
||||||
|
}
|
4
testRobots.txt
Normal file
4
testRobots.txt
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
disallow: https://localhost:8080
|
||||||
|
disallow: http://localhost:3002/respond200
|
||||||
|
disallow: http://localhost:3000/respond200
|
||||||
|
disallow: /respond200
|
2899
testlog.txt
Normal file
2899
testlog.txt
Normal file
File diff suppressed because it is too large
Load diff
31
whackyHackyHandlerFunc.mermaid
Normal file
31
whackyHackyHandlerFunc.mermaid
Normal file
|
@ -0,0 +1,31 @@
|
||||||
|
stateDiagram-v2
|
||||||
|
direction TB
|
||||||
|
|
||||||
|
accTitle: crawler-blocker Diagram
|
||||||
|
accDescr: Diagram of main.go
|
||||||
|
|
||||||
|
|
||||||
|
classDef systemShell fill:#00f1,font-weight:bold,stroke-width:1px,stroke:black
|
||||||
|
classDef subroutine fill: #dff0fe,stroke-width:1px,stroke:black
|
||||||
|
classDef nixShell fill:#00f1,font-weight:bold,stroke-width:1px,stroke:black
|
||||||
|
classDef terminal fill:black,color:#d7e0ff,font-weight:bold,stroke-width:1px,stroke:black
|
||||||
|
|
||||||
|
class nixShellEnvironment nixShell
|
||||||
|
class setConfigSubroutine, nixShellExitSubroutine subroutine
|
||||||
|
class mainFunction systemShell
|
||||||
|
class preEntryState, setConfigSubroutine, coswayCommand, postExitState terminal
|
||||||
|
|
||||||
|
preEntryState --> mainFunction
|
||||||
|
setConfigSubroutine --> setConfigSubroutine
|
||||||
|
|
||||||
|
state "DefaultServeMux (I think) listens on ports 443 and 80, or something" as preEntryState
|
||||||
|
state "HTTP request on 443 triggers entrypoint to whackyHackyHandlerFunc" as preEntryState
|
||||||
|
|
||||||
|
state "whackyHackyHandlerFunc" as whhf {
|
||||||
|
state "set reverseProxyInstance, of type ModifiedReverseProxy." as instantiateReverseProxy
|
||||||
|
state "This type composes ('inherits', but not technically somehow) the ReverseProxy type from the stdlib." as instantiateReverseProxy
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
state "return from whackyHackyHandlerFunc, probably killing a goroutine that was spawned by DefaultServeMux." as return
|
Loading…
Reference in a new issue