symys-add-loggedin-dontblock #1
17 changed files with 8898 additions and 99857 deletions
14
ReadMe.md
14
ReadMe.md
|
@ -1,13 +1,23 @@
|
|||
# forgejo-crawler-blocker
|
||||
|
||||
What does a GPT training web-crawler see when it tries to access our forgejo instance and look at every single file at every single commit, ignoring `robots.txt` and sending a generic user-agent header?
|
||||
|
||||
What can
|
||||
I say except
|
||||
Hail Eris,
|
||||
All Hail Discordia!
|
||||
|
||||
|
||||
# maintenance
|
||||
|
||||
If anyone needs to clear the data to unblock someone, these are the commands to run on paimon:
|
||||
|
||||
`sudo -i`
|
||||
|
||||
```
|
||||
docker stop gitea_forgejo-crawler-blocker_1
|
||||
docker stop gitea_forgejo-crawler-blocker
|
||||
rm /etc/docker-compose/gitea/forgejo-crawler-blocker/traffic.db
|
||||
docker start gitea_forgejo-crawler-blocker_1
|
||||
docker start gitea_forgejo-crawler-blocker
|
||||
```
|
||||
|
||||
# persistent data storage
|
||||
|
|
11
config.json
11
config.json
|
@ -1,15 +1,20 @@
|
|||
{
|
||||
"UpstreamHostname": "gitea",
|
||||
"UpstreamHostname": "localhost",
|
||||
"UpstreamPort": 3000,
|
||||
"ListenPort": 3002,
|
||||
"TarpitAfterRecentRobotsTxtViolations": 100,
|
||||
"TarpitAfterRobotsTxtViolationsPerSecond": 0.1,
|
||||
"TarpitAfterRobotsTxtViolationsPerSecond": 20.0,
|
||||
"GlobalRateLimit": 5.0,
|
||||
"AllowedPaths": [
|
||||
"/respond303",
|
||||
"respond200RobotsAllowed"
|
||||
],
|
||||
"GlobalVerbosityLevel":5,
|
||||
"BlockedUserAgentRegexes": [
|
||||
"Python/[^ ]+ aiohttp/.*"
|
||||
],
|
||||
"WhitelistUserAgentRegexes": [
|
||||
"^git/[0-9\\.]*$",
|
||||
"^JGit/[0-9a-z\\.]*$",
|
||||
"https://github.com/matrix-org/synapse"
|
||||
],
|
||||
"BlockedSourceIpPrefixes": [
|
||||
|
|
198
crawlpit_test.go
Normal file
198
crawlpit_test.go
Normal file
|
@ -0,0 +1,198 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// EditableUserAgentTransport is an http.RoundTripper wrapper that lets the
// tests send requests carrying an arbitrary User-Agent header.
type EditableUserAgentTransport struct {
	// CustomUserAgent is the value placed in the User-Agent header of every
	// request that passes through this transport.
	CustomUserAgent string
	// UnderlyingTransport performs the actual round trip once the header
	// has been set.
	UnderlyingTransport http.RoundTripper
}
|
||||
|
||||
func NewEditableUserAgentTransport(desiredUserAgent string) EditableUserAgentTransport {
|
||||
transportOut := EditableUserAgentTransport{
|
||||
CustomUserAgent: desiredUserAgent,
|
||||
UnderlyingTransport: http.DefaultTransport,
|
||||
}
|
||||
return transportOut
|
||||
}
|
||||
|
||||
func (t *EditableUserAgentTransport) RoundTrip(request *http.Request) (*http.Response, error) {
|
||||
verbosityLevelLogPrint(fmt.Sprintf("roundtripping %s...", request.URL), 5)
|
||||
request.Header["User-Agent"] = []string{t.CustomUserAgent}
|
||||
response, err := t.UnderlyingTransport.RoundTrip(request)
|
||||
if err != nil {
|
||||
log.Printf("contacting %s (%s) failed: %s", request.RequestURI, request.URL.String(), err)
|
||||
return nil, err
|
||||
}
|
||||
return response, nil
|
||||
}
|
||||
|
||||
/*
|
||||
func TestGetRobotsDotTxt(t *testing.T) {
|
||||
fmt.Println("Running test TestGetRobotsDotTxt")
|
||||
//It's assumed configureSelf() will take longer than setting up
|
||||
//the server in runSimpleServer(), so no additional logic has been
|
||||
//added to ensure the server is running before attempting to use it
|
||||
//Should be fine, as it errs on the side of failure anyway.
|
||||
go runSimpleServer(":3000", "TestGetRobotsDotTxt")
|
||||
configureSelf()
|
||||
|
||||
thisRobots := getRobotsDotTxt()
|
||||
robotsMsg := fmt.Sprintf("robots.txt: %v\n", thisRobots)
|
||||
log.Printf(robotsMsg)
|
||||
thisShouldMatch := "http://localhost:3002/respond200"
|
||||
thisShouldNotMatch := "https://localhost:3002/respond303"
|
||||
|
||||
doesActuallyMatch := false
|
||||
shouldNotMatchButDoes := false
|
||||
for _, r := range thisRobots {
|
||||
if r.MatchString(thisShouldMatch) {
|
||||
doesActuallyMatch = true
|
||||
}
|
||||
if r.MatchString(thisShouldNotMatch) {
|
||||
shouldNotMatchButDoes = true
|
||||
}
|
||||
}
|
||||
if !doesActuallyMatch {
|
||||
t.Errorf("Pattern %v should match robots regexp but does not.", thisShouldMatch)
|
||||
}
|
||||
if shouldNotMatchButDoes {
|
||||
t.Errorf("Pattern %v should NOT match robots regexp, but it does.", thisShouldNotMatch)
|
||||
}
|
||||
|
||||
}
|
||||
*/
|
||||
|
||||
// TestCase describes a single HTTP GET issued against the reverse proxy and
// the status code the proxy is expected to answer with.
type TestCase struct {
	// Name identifies the case in failure messages.
	Name string
	// TestURL is the URL that is fetched.
	TestURL string
	// StatusWant is the HTTP status code the response must carry.
	StatusWant int
	// CustomClient is the client (and therefore transport / User-Agent)
	// used to issue the request.
	CustomClient http.Client
	// SleepTime is how long to pause after the request has completed.
	SleepTime time.Duration
}
|
||||
|
||||
func (rb TestCase) DoRequestStuff(t *testing.T) (measuredFetchTime time.Duration) {
|
||||
|
||||
testURL := rb.TestURL
|
||||
testURLMsg := fmt.Sprintf("testURL: %v", testURL)
|
||||
verbosityLevelLogPrint(testURLMsg, 5)
|
||||
thisClient := rb.CustomClient
|
||||
timeBeforeGet := time.Now()
|
||||
thisResponse, err := thisClient.Get(testURL)
|
||||
measuredFetchTime = time.Since(timeBeforeGet)
|
||||
statusWant := rb.StatusWant
|
||||
statusGot := thisResponse.StatusCode
|
||||
if err != nil {
|
||||
t.Errorf("Error getting %s: %v", testURL, err)
|
||||
} else if statusWant != statusGot {
|
||||
t.Errorf("Test `%v` should fail: Should have gotten status %v but got %v!", rb.Name, statusWant, statusGot)
|
||||
}
|
||||
fmt.Printf("thisResponse: %v\n", thisResponse)
|
||||
time.Sleep(rb.SleepTime)
|
||||
return
|
||||
}
|
||||
|
||||
func TestMain(t *testing.T) {
|
||||
GLOBAL_VERBOSITY = 5
|
||||
fmt.Println("Running test TestMain")
|
||||
go runSimpleServer(":3000", "TestMain")
|
||||
|
||||
commonRealUserAgent := "Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0"
|
||||
thisTransport := NewEditableUserAgentTransport(commonRealUserAgent)
|
||||
thisClient := http.Client{Transport: &thisTransport}
|
||||
go main()
|
||||
|
||||
//Sleep while main sets up the reverse proxy
|
||||
//Inelegant, but errs on the side of failure
|
||||
//so its okay
|
||||
time.Sleep(1 * time.Second)
|
||||
|
||||
testCaseNormalUserLike := TestCase{
|
||||
Name: "Normal User test case",
|
||||
TestURL: "http://localhost:3002/respond200",
|
||||
StatusWant: 200,
|
||||
CustomClient: thisClient,
|
||||
SleepTime: 0 * time.Second,
|
||||
}
|
||||
|
||||
testCaseNormalUserLike.DoRequestStuff(t)
|
||||
|
||||
testCaseMaliciousUser := TestCase{
|
||||
Name: "Malicious user test case",
|
||||
TestURL: "http://localhost:3002/respond200",
|
||||
StatusWant: 200,
|
||||
CustomClient: thisClient,
|
||||
SleepTime: 0 * time.Second,
|
||||
}
|
||||
|
||||
numRequestsDone := 0
|
||||
testDuration := time.Duration(0)
|
||||
for i := 0; i < 20; i++ {
|
||||
testDuration += testCaseMaliciousUser.DoRequestStuff(t)
|
||||
durationMsg := fmt.Sprintf("test iteration %v had duration %v", i, testDuration)
|
||||
verbosityLevelLogPrint(durationMsg, 5)
|
||||
numRequestsDone++
|
||||
}
|
||||
|
||||
testRate := float64(numRequestsDone) / testDuration.Seconds()
|
||||
totalDurationMsg := fmt.Sprintf("Rate limit test completed %v test requests in %v seconds", numRequestsDone, testDuration.Seconds())
|
||||
verbosityLevelLogPrint(totalDurationMsg, 5)
|
||||
if testRate > GLOBAL_RATE_LIMIT+1000 {
|
||||
t.Errorf("MaliciousUser test should fail: reverse proxy failed to enforce global rate limit of %v. Tested average request rate was: %v", GLOBAL_RATE_LIMIT, testRate)
|
||||
}
|
||||
|
||||
sillyFakeUserAgent := "I am 3vil and undetectable 8D"
|
||||
thisTransport = NewEditableUserAgentTransport(sillyFakeUserAgent)
|
||||
thisClient = http.Client{Transport: &thisTransport}
|
||||
|
||||
maxNumRobotsViolations := config.TarpitAfterRecentRobotsTxtViolations
|
||||
desiredRateLimit := config.TarpitAfterRobotsTxtViolationsPerSecond
|
||||
neededSleepTime := 1000.0 / (desiredRateLimit + 1.0)
|
||||
rateLimitMsg := fmt.Sprintf("desiredRateLimit: %v, so sleep time should be %v", desiredRateLimit, neededSleepTime)
|
||||
verbosityLevelLogPrint(rateLimitMsg, 5)
|
||||
|
||||
testCaseRespectfulBotLike := TestCase{
|
||||
Name: "Respectful bot test case",
|
||||
TestURL: "http://localhost:3002/respond200",
|
||||
StatusWant: 403,
|
||||
CustomClient: thisClient,
|
||||
SleepTime: time.Duration(neededSleepTime) * time.Millisecond,
|
||||
}
|
||||
for reqIndex := 0; reqIndex < maxNumRobotsViolations-21; reqIndex++ {
|
||||
testCaseRespectfulBotLike.DoRequestStuff(t)
|
||||
}
|
||||
|
||||
testCaseRespectfulBotPassesThreshold := TestCase{
|
||||
Name: "Test case which should be the first to pass the tarpit threshold.",
|
||||
TestURL: "http://localhost:3002/respond200",
|
||||
StatusWant: 200,
|
||||
CustomClient: thisClient,
|
||||
SleepTime: time.Duration(neededSleepTime) * time.Millisecond,
|
||||
}
|
||||
testCaseRespectfulBotPassesThreshold.DoRequestStuff(t)
|
||||
|
||||
testCaseUserLoggingIn := TestCase{
|
||||
Name: "Test case case to check that a user can visit explicitly allowed page",
|
||||
TestURL: "http://localhost:3002/respond303",
|
||||
StatusWant: 303,
|
||||
CustomClient: thisClient,
|
||||
SleepTime: 0 * time.Second,
|
||||
}
|
||||
testCaseUserLoggingIn.DoRequestStuff(t)
|
||||
|
||||
testCaseUserAfterLoggedIn := TestCase{
|
||||
Name: "Test case to check for already logged-in user",
|
||||
TestURL: "http://localhost:3002/respond200",
|
||||
StatusWant: 200,
|
||||
CustomClient: thisClient,
|
||||
SleepTime: 0 * time.Second,
|
||||
}
|
||||
|
||||
testCaseUserAfterLoggedIn.DoRequestStuff(t)
|
||||
|
||||
}
|
17
go.mod
17
go.mod
|
@ -1,11 +1,14 @@
|
|||
module git.sequentialread.com/forest/forgejo-crawler-blocker
|
||||
module crawlpit
|
||||
|
||||
go 1.19
|
||||
go 1.21.6
|
||||
|
||||
require (
|
||||
git.sequentialread.com/forest/config-lite v0.0.0-20220225195944-164dc71bce04 // indirect
|
||||
github.com/boltdb/bolt v1.3.1 // indirect
|
||||
github.com/texttheater/golang-levenshtein/levenshtein v0.0.0-20200805054039-cae8b0eaed6c // indirect
|
||||
golang.org/x/sys v0.16.0 // indirect
|
||||
zgo.at/isbot v1.0.0 // indirect
|
||||
git.sequentialread.com/forest/config-lite v0.0.0-20220225195944-164dc71bce04
|
||||
github.com/boltdb/bolt v1.3.1
|
||||
zgo.at/isbot v1.0.0
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/texttheater/golang-levenshtein/levenshtein v0.0.0-20200805054039-cae8b0eaed6c // indirect
|
||||
golang.org/x/sys v0.17.0 // indirect
|
||||
)
|
||||
|
|
4
go.sum
4
go.sum
|
@ -4,7 +4,7 @@ github.com/boltdb/bolt v1.3.1 h1:JQmyP4ZBrce+ZQu0dY660FMfatumYDLun9hBCUVIkF4=
|
|||
github.com/boltdb/bolt v1.3.1/go.mod h1:clJnj/oiGkjum5o1McbSZDSLxVThjynRyGBgiAx27Ps=
|
||||
github.com/texttheater/golang-levenshtein/levenshtein v0.0.0-20200805054039-cae8b0eaed6c h1:HelZ2kAFadG0La9d+4htN4HzQ68Bm2iM9qKMSMES6xg=
|
||||
github.com/texttheater/golang-levenshtein/levenshtein v0.0.0-20200805054039-cae8b0eaed6c/go.mod h1:JlzghshsemAMDGZLytTFY8C1JQxQPhnatWqNwUXjggo=
|
||||
golang.org/x/sys v0.16.0 h1:xWw16ngr6ZMtmxDyKyIgsE93KNKz5HKmMa3b8ALHidU=
|
||||
golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/sys v0.17.0 h1:25cE3gD+tdBA7lp7QfhuV+rJiE9YXTcS3VG1SqssI/Y=
|
||||
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
zgo.at/isbot v1.0.0 h1:wM+40WO2LudR5352eCAQlHQdWsdNVpngB8Oe1qlTpdM=
|
||||
zgo.at/isbot v1.0.0/go.mod h1:/w+LTWbHsV7G5fT41VnwM0kTdk1H/prynfMqaxcBWjA=
|
||||
|
|
44
logicalDiagram.mermaid
Normal file
44
logicalDiagram.mermaid
Normal file
|
@ -0,0 +1,44 @@
|
|||
stateDiagram-v2
|
||||
direction TB
|
||||
|
||||
accTitle: crawler-blocker Diagram
|
||||
accDescr: Diagram of main.go
|
||||
|
||||
|
||||
classDef systemShell fill:#00f1,font-weight:bold,stroke-width:1px,stroke:black
|
||||
classDef subroutine fill: #dff0fe,stroke-width:1px,stroke:black
|
||||
classDef nixShell fill:#00f1,font-weight:bold,stroke-width:1px,stroke:black
|
||||
classDef terminal fill:black,color:#d7e0ff,font-weight:bold,stroke-width:1px,stroke:black
|
||||
|
||||
class nixShellEnvironment nixShell
|
||||
class setConfigSubroutine, nixShellExitSubroutine subroutine
|
||||
class mainFunction systemShell
|
||||
class preEntryState, setConfigSubroutine, coswayCommand, postExitState terminal
|
||||
|
||||
preEntryState --> mainFunction
|
||||
setConfigSubroutine --> setConfigSubroutine
|
||||
setConfigSubroutine --> nixShellEnvironment
|
||||
nixShellEnvironment --> nixShellExitSubroutine
|
||||
mainFunction --> postExitState
|
||||
|
||||
state "Define types. For convenience and clarity." as preEntryState
|
||||
state "Define global variables and constants. These make it easier to pass data between some of the other functions below." as preEntryState
|
||||
state "entrypoint to main" as preEntryState
|
||||
state "main exits and control is returned to the calling environment" as postExitState
|
||||
state "main" as mainFunction {
|
||||
|
||||
state "configureSelf() is called" as callToConfigSelf {
|
||||
|
||||
state "Configuration Subroutine" as setConfigSubroutine
|
||||
state "Import config from file, panic if error occurs" as setConfigSubroutine
|
||||
state "Check that config parameters are within acceptable values. Exit if not." as setConfigSubroutine
|
||||
|
||||
state "DataBase Stat and Initialization Subroutine" as dbStatSubroutine
|
||||
state "stat ./data directory. Create it if it doesn't exist." as dbStatSubroutine
|
||||
}
|
||||
state "call http.HandleFunc(\"/\", whackyHackyHandlerFunc)" as handleFuncCall
|
||||
state "Register whackyHackyHandlerFunc with pattern / to the DefaultServeMux." as handleFuncCall
|
||||
state "The / pattern basically matches everything from the root of the URL path." as handleFuncCall
|
||||
state "So here, all incoming requests should be handled by this function." as handleFuncCall
|
||||
|
||||
}
|
313
main.go
313
main.go
|
@ -21,13 +21,43 @@ import (
|
|||
isbot "zgo.at/isbot"
|
||||
)
|
||||
|
||||
// ResponseCodeReportingTransport is an http.RoundTripper wrapper that inspects
// the status code of upstream responses on their way back through the proxy.
type ResponseCodeReportingTransport struct {
	// UnderlyingTransport performs the actual round trip to the upstream.
	UnderlyingTransport http.RoundTripper
	// ReportingChannel is a channel of strings associated with this
	// transport.
	// NOTE(review): the RoundTrip method in this file reports through the
	// package-level AllowIPChan and does not read this field; confirm
	// whether it is still needed.
	ReportingChannel chan string
}
|
||||
|
||||
var AllowIPChan chan string
|
||||
|
||||
func (t *ResponseCodeReportingTransport) RoundTrip(request *http.Request) (*http.Response, error) {
|
||||
verbosityLevelLogPrint(fmt.Sprintf("roundtripping %s...", request.URL), 5)
|
||||
response, err := t.UnderlyingTransport.RoundTrip(request)
|
||||
if err != nil {
|
||||
log.Printf("contacting %s (%s) failed: %s", request.RequestURI, request.URL.String(), err)
|
||||
return nil, err
|
||||
}
|
||||
|
||||
verbosityLevelLogPrint("About to check status code for 303 within custom RoundTrip implementation", 5)
|
||||
if response.StatusCode == http.StatusSeeOther {
|
||||
verbosityLevelLogPrint("Found status 303, sending to channel", 5)
|
||||
remoteIPNoPort := request.RemoteAddr[:strings.LastIndex(request.RemoteAddr, ":")]
|
||||
AllowIPChan <- remoteIPNoPort
|
||||
mapMsg := fmt.Sprintf("Sent the following IP to AllowIPChan: %v", remoteIPNoPort)
|
||||
verbosityLevelLogPrint(mapMsg, 5)
|
||||
}
|
||||
verbosityLevelLogPrint("Checked status code for 303.", 5)
|
||||
|
||||
return response, nil
|
||||
}
|
||||
|
||||
type Config struct {
|
||||
UpstreamHostname string
|
||||
UpstreamPort int
|
||||
ListenPort int
|
||||
GlobalRateLimit float64
|
||||
AllowedPaths []string
|
||||
GlobalVerbosityLevel int
|
||||
TarpitAfterRecentRobotsTxtViolations int
|
||||
TarpitAfterRobotsTxtViolationsPerSecond float64
|
||||
HardBlockBasedOnRobotsTxt bool
|
||||
WhitelistUserAgentRegexes []string
|
||||
BlockedUserAgentRegexes []string
|
||||
BlockedSourceIpPrefixes []BlockedSourceIpPrefix
|
||||
|
@ -40,22 +70,63 @@ type BlockedSourceIpPrefix struct {
|
|||
}
|
||||
|
||||
// RequestsLog is the per-IP traffic record. It is decoded from JSON read out
// of the "ip" bucket of the traffic database.
//
// The fields carry no json struct tags on purpose. The tags that used to be
// here were written without quotes (e.g. `json:total_requests`), which is not
// valid struct-tag syntax: encoding/json ignored them and has always used the
// Go field names as keys. Adding properly quoted snake_case tags now would
// rename the keys and make previously stored records decode as empty, so any
// such change needs a data migration.
type RequestsLog struct {
	// AllowEverywhere is set once an allow signal has been recorded for
	// this IP.
	AllowEverywhere bool
	// TotalRequests counts every request recorded for this IP.
	TotalRequests int64
	// NumAveragedRequests is the number of requests in the current
	// averaging window.
	NumAveragedRequests int64
	// RecentRobotsTxtViolations counts recent requests for which bots were
	// not allowed.
	RecentRobotsTxtViolations int64
	// RollingAvgRequestRate is the rolling request-rate figure compared
	// against the tarpit threshold.
	RollingAvgRequestRate float64
	// TimeOfLastRequest is when the most recent request was recorded.
	TimeOfLastRequest time.Time
	// TimeOfFirstRequest is when the current averaging window started.
	TimeOfFirstRequest time.Time
	// Blocked is the time at which the IP was last blocked; the zero value
	// means it has never been blocked.
	Blocked time.Time
}
|
||||
|
||||
type LoggedInMap map[string]time.Time
|
||||
|
||||
var BlockedUserAgentRegexes []*regexp.Regexp
|
||||
var WhitelistUserAgentRegexes []*regexp.Regexp
|
||||
var ExplicitlyAllowedPaths []*regexp.Regexp
|
||||
var robotsTxtDisallow []*regexp.Regexp
|
||||
var db *bolt.DB
|
||||
var config Config
|
||||
var GLOBAL_VERBOSITY int
|
||||
var GLOBAL_RATE_LIMIT float64 //in requests per second
|
||||
|
||||
const MyBlockedSourceIpPrefixIsBotReasonOffset = 40
|
||||
|
||||
func main() {
|
||||
|
||||
AllowIPChan = make(chan string, 10)
|
||||
|
||||
verbosityLevelLogPrint(fmt.Sprintf("GLOBAL_VERBOSITY set to: %v", GLOBAL_VERBOSITY), 5)
|
||||
|
||||
verbosityLevelLogPrint("Before call to configureSelf()", 5)
|
||||
configureSelf()
|
||||
verbosityLevelLogPrint("After call to configureSelf()", 5)
|
||||
|
||||
verbosityLevelLogPrint("Before opening database", 5)
|
||||
var err error
|
||||
db, err = bolt.Open("data/traffic.db", 0600, nil)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
defer db.Close()
|
||||
verbosityLevelLogPrint("After opening database", 5)
|
||||
|
||||
verbosityLevelLogPrint("Before call to getRobotsDotTxt()", 5)
|
||||
robotsTxtDisallow = getRobotsDotTxt()
|
||||
verbosityLevelLogPrint("After call to getRobotsDotTxt()", 5)
|
||||
|
||||
verbosityLevelLogPrint("Registering whackyHackyHandlerFunc as root handler", 5)
|
||||
http.HandleFunc("/", whackyHackyHandlerFunc)
|
||||
|
||||
listenMessage := fmt.Sprintf("im about to try to listen on port %d and forward to http://%s:%d!\n",
|
||||
config.ListenPort, config.UpstreamHostname, config.UpstreamPort)
|
||||
verbosityLevelLogPrint(listenMessage, 1)
|
||||
log.Fatal(http.ListenAndServe(fmt.Sprintf(":%d", config.ListenPort), nil))
|
||||
}
|
||||
|
||||
func configureSelf() {
|
||||
|
||||
config = Config{}
|
||||
ignoreCommandlineFlags := []string{}
|
||||
err := configlite.ReadConfiguration("config.json", "FCB", ignoreCommandlineFlags, reflect.ValueOf(&config))
|
||||
|
@ -66,11 +137,14 @@ func main() {
|
|||
configBytes, _ := json.MarshalIndent(config, "", " ")
|
||||
log.Println(string(configBytes))
|
||||
|
||||
GLOBAL_VERBOSITY = config.GlobalVerbosityLevel
|
||||
GLOBAL_RATE_LIMIT = config.GlobalRateLimit
|
||||
|
||||
if config.TarpitAfterRecentRobotsTxtViolations < 10 {
|
||||
log.Fatalf("TarpitAfterRecentRobotsTxtViolations (%d) must be greater than 10", config.TarpitAfterRecentRobotsTxtViolations)
|
||||
}
|
||||
if config.TarpitAfterRobotsTxtViolationsPerSecond < 0.001 {
|
||||
log.Fatalf("TarpitAfterRobotsTxtViolationsPerSecond (%d) must be greater than 0.001", config.TarpitAfterRobotsTxtViolationsPerSecond)
|
||||
log.Fatalf("TarpitAfterRobotsTxtViolationsPerSecond (%f) must be greater than 0.001", config.TarpitAfterRobotsTxtViolationsPerSecond)
|
||||
}
|
||||
|
||||
_, err = os.Stat("data")
|
||||
|
@ -82,12 +156,6 @@ func main() {
|
|||
}
|
||||
}
|
||||
|
||||
db, err = bolt.Open("data/traffic.db", 0600, nil)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
BlockedUserAgentRegexes = []*regexp.Regexp{}
|
||||
for _, r := range config.BlockedUserAgentRegexes {
|
||||
BlockedUserAgentRegexes = append(BlockedUserAgentRegexes, regexp.MustCompile(r))
|
||||
|
@ -98,43 +166,55 @@ func main() {
|
|||
WhitelistUserAgentRegexes = append(WhitelistUserAgentRegexes, regexp.MustCompile(r))
|
||||
}
|
||||
|
||||
ExplicitlyAllowedPaths = []*regexp.Regexp{}
|
||||
for _, pathPattern := range config.AllowedPaths {
|
||||
ExplicitlyAllowedPaths = append(ExplicitlyAllowedPaths, regexp.MustCompile(pathPattern))
|
||||
}
|
||||
}
|
||||
|
||||
func lookupHostname(hostname string) string {
|
||||
|
||||
dnsCacheTTL := time.Second * 5
|
||||
lastDnsResolution := time.Unix(0, 0)
|
||||
backendIPAddreses := []net.IP{}
|
||||
lookupHostname := func(hostname string) string {
|
||||
|
||||
if time.Since(lastDnsResolution) > dnsCacheTTL {
|
||||
var err error
|
||||
backendIPAddreses, err = net.LookupIP(hostname)
|
||||
if err != nil {
|
||||
log.Printf("DNS resolution of hostname (%s) failed!! error: %s", hostname, err)
|
||||
}
|
||||
checkErrWarn(err, fmt.Sprintf("DNS resolution of hostname (%s) failed!! error: %s", hostname, err), 1)
|
||||
lastDnsResolution = time.Now()
|
||||
}
|
||||
if backendIPAddreses != nil && len(backendIPAddreses) != 0 {
|
||||
return backendIPAddreses[rand.Intn(len(backendIPAddreses))].String()
|
||||
}
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
reverseProxyInstance := &httputil.ReverseProxy{
|
||||
Director: func(request *http.Request) {
|
||||
func reverseProxyDirectorFunction(request *http.Request) {
|
||||
ip := lookupHostname(config.UpstreamHostname)
|
||||
address := fmt.Sprintf("%s:%d", ip, config.UpstreamPort)
|
||||
request.Host = address
|
||||
request.URL.Scheme = "http"
|
||||
request.URL.Host = address
|
||||
log.Printf("connecting to %s...\n", address)
|
||||
verbosityLevelLogPrint(fmt.Sprintf("connecting to %s...\n", address), 1)
|
||||
}
|
||||
|
||||
// symys to forest: I put this handler into its own named function,
|
||||
// mainly because a work of art deserves a title
|
||||
func whackyHackyHandlerFunc(responseWriter http.ResponseWriter, request *http.Request) {
|
||||
|
||||
verbosityLevelLogPrint("whackyHackyHandlerFunc called", 5)
|
||||
|
||||
reverseProxyInstance := httputil.ReverseProxy{
|
||||
Director: reverseProxyDirectorFunction,
|
||||
Transport: &ResponseCodeReportingTransport{
|
||||
UnderlyingTransport: http.DefaultTransport,
|
||||
},
|
||||
Transport: http.DefaultTransport,
|
||||
}
|
||||
|
||||
//robotsTxtDisallow := []regexp.Regexp{}
|
||||
robotsTxtDisallow := getRobotsDotTxt()
|
||||
|
||||
http.HandleFunc("/", func(responseWriter http.ResponseWriter, request *http.Request) {
|
||||
|
||||
remoteIp := getRemoteIP(request)
|
||||
userAgent := request.Header.Get("User-Agent")
|
||||
verbosityLevelLogPrint(fmt.Sprintf("userAgnt from whackyHackyHandlerFunc: %v", userAgent), 5)
|
||||
isBotResult := isbot.Bot(request)
|
||||
if isbot.Is(isBotResult) {
|
||||
for _, r := range WhitelistUserAgentRegexes {
|
||||
|
@ -152,13 +232,46 @@ func main() {
|
|||
|
||||
botsAllowed := true
|
||||
for _, r := range robotsTxtDisallow {
|
||||
thisRegexpMsg := fmt.Sprintf("Checking URL pattern: %v against regexp: %v", request.URL.Path, r)
|
||||
verbosityLevelLogPrint(thisRegexpMsg, 5)
|
||||
if r.MatchString(request.URL.Path) {
|
||||
verbosityLevelLogPrint(fmt.Sprintf("Request matched disallowed pattern: %v", r), 5)
|
||||
botsAllowed = false
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
blockedTime, count := incrementTrafficDB(remoteIp, request.URL.Path, isbot.Is(isBotResult), botsAllowed)
|
||||
botsException := false
|
||||
for _, r := range ExplicitlyAllowedPaths {
|
||||
thisRegexpMsg := fmt.Sprintf("Checking URL pattern: %v against regexp: %v", request.URL.Path, r)
|
||||
verbosityLevelLogPrint(thisRegexpMsg, 5)
|
||||
if r.MatchString(request.URL.Path) {
|
||||
verbosityLevelLogPrint(fmt.Sprintf("Request matched allowed pattern: %v", r), 5)
|
||||
botsException = true
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
blockedTime, count, sleepTimeNeeded, allowedEverywhere := incrementTrafficDB(remoteIp, request.URL.Path, isbot.Is(isBotResult), botsAllowed)
|
||||
|
||||
if allowedEverywhere {
|
||||
verbosityLevelLogPrint(fmt.Sprintf("%v is allowed everywhere as it was previously served a 303.", remoteIp), 5)
|
||||
reverseProxyInstance.ServeHTTP(responseWriter, request)
|
||||
return
|
||||
}
|
||||
|
||||
if botsException || botsAllowed || isbot.IsNot(isBotResult) {
|
||||
sleepyTime := time.Duration(sleepTimeNeeded*1000) * time.Millisecond
|
||||
if sleepyTime > 0 {
|
||||
verbosityLevelLogPrint(fmt.Sprintf("Need to sleep for: %v , to comply with rate limit", sleepyTime), 5)
|
||||
time.Sleep(sleepyTime)
|
||||
} else {
|
||||
verbosityLevelLogPrint("Request looks fine, but not logged in, so subject to rate limit. It's staying within the rate limit on its own.", 4)
|
||||
}
|
||||
reverseProxyInstance.ServeHTTP(responseWriter, request)
|
||||
return
|
||||
}
|
||||
|
||||
hoursInAMonth := float64(24 * 30)
|
||||
blocked := time.Since(blockedTime).Hours() < hoursInAMonth
|
||||
|
||||
|
@ -168,16 +281,20 @@ func main() {
|
|||
)
|
||||
|
||||
if blocked || strings.Trim(request.URL.Path, "/") == "bible.txt" {
|
||||
tarpitLogMsg := fmt.Sprintf("Tarpit activated for %v", remoteIp)
|
||||
verbosityLevelLogPrint(tarpitLogMsg, 2)
|
||||
bible, err := os.Open("bible.txt")
|
||||
if err != nil {
|
||||
http.Error(responseWriter, `1:0 Welcome to our bad bot behaviour management tarpit.
|
||||
You are here because you requested over 101 "below-surface-level" pages
|
||||
in under 16 minutes. Your IP address will be blocked for 1 month.
|
||||
If you wish to have it unblocked, please email ops@cyberia.club`, http.StatusForbidden)
|
||||
If you wish to have it unblocked, please email ops@cyberia.club`, http.StatusTeapot)
|
||||
return
|
||||
}
|
||||
|
||||
responseWriter.Header().Set("Content-Type", "text/html")
|
||||
//Uncomment to give the bots a little heads-up
|
||||
//responseWriter.WriteHeader(http.StatusTeapot)
|
||||
|
||||
fmt.Fprint(responseWriter, `<!DOCTYPE HTML>
|
||||
<html lang="en">
|
||||
|
@ -211,34 +328,26 @@ func main() {
|
|||
return
|
||||
}
|
||||
|
||||
// if blocked || strings.Trim(request.URL.Path, "/") == "bible.txt" {
|
||||
// http.Error(responseWriter, "403 Forbidden: You look like a bot and this path is disallowed by robots.txt", http.StatusForbidden)
|
||||
// return
|
||||
// }
|
||||
|
||||
if config.HardBlockBasedOnRobotsTxt && !botsAllowed && isbot.Is(isBotResult) {
|
||||
if !botsAllowed && isbot.Is(isBotResult) {
|
||||
http.Error(responseWriter, "403 Forbidden: You look like a bot and this path is disallowed by robots.txt", http.StatusForbidden)
|
||||
return
|
||||
}
|
||||
|
||||
if botsAllowed || isbot.IsNot(isBotResult) {
|
||||
reverseProxyInstance.ServeHTTP(responseWriter, request)
|
||||
}
|
||||
|
||||
})
|
||||
|
||||
log.Printf(
|
||||
"im about to try to listen on port %d and forward to http://%s:%d!\n",
|
||||
config.ListenPort, config.UpstreamHostname, config.UpstreamPort,
|
||||
)
|
||||
log.Fatal(http.ListenAndServe(fmt.Sprintf(":%d", config.ListenPort), nil))
|
||||
}
|
||||
|
||||
func incrementTrafficDB(remoteIp, requestPath string, isBot, botsAllowed bool) (time.Time, int64) {
|
||||
// LogUpdate bundles the inputs and outputs of one traffic-database update.
// The first four fields are set by the caller before the update runs; the
// *Return fields are filled in by UpdateTheLog.
type LogUpdate struct {
	// ForIP is the address whose record is updated.
	ForIP string
	// AllowSignal, when true, marks the address as allowed everywhere.
	AllowSignal bool
	// RequestPath is the requested path; together with ForIP it keys the
	// per-path visit counter.
	RequestPath string
	// BotsAllowed is false when the request counts as a robots.txt
	// violation.
	BotsAllowed bool

	// SleepTimeNeededReturn is the delay, in seconds, the caller should
	// apply before serving the request.
	SleepTimeNeededReturn float64
	// BlockedReturn is the time at which the address was last blocked.
	BlockedReturn time.Time
	// RecentRobotsViolationsReturn is the address's recent violation count
	// after this update.
	RecentRobotsViolationsReturn int64
	// AllowedEverywhereReturn reports whether the address is allowed
	// everywhere after this update.
	AllowedEverywhereReturn bool
}
|
||||
|
||||
toReturn := time.Time{}
|
||||
toReturn2 := int64(0)
|
||||
err := db.Update(func(tx *bolt.Tx) error {
|
||||
func (logUpdateIn *LogUpdate) UpdateTheLog(tx *bolt.Tx) error {
|
||||
verbosityLevelLogPrint("calling db.Update", 5)
|
||||
bucket, err := tx.CreateBucketIfNotExists([]byte("ip"))
|
||||
if err != nil {
|
||||
return err
|
||||
|
@ -248,14 +357,14 @@ func incrementTrafficDB(remoteIp, requestPath string, isBot, botsAllowed bool) (
|
|||
return err
|
||||
}
|
||||
|
||||
ipKey := []byte(fmt.Sprintf("%s", remoteIp))
|
||||
ipKey := []byte(fmt.Sprintf("%s", logUpdateIn.ForIP))
|
||||
ipBytes := bucket.Get(ipKey)
|
||||
ip := RequestsLog{}
|
||||
if ipBytes != nil {
|
||||
json.Unmarshal(ipBytes, &ip)
|
||||
}
|
||||
|
||||
ipPathKey := []byte(fmt.Sprintf("%s_%s", remoteIp, requestPath))
|
||||
ipPathKey := []byte(fmt.Sprintf("%s_%s", logUpdateIn.ForIP, logUpdateIn.RequestPath))
|
||||
ipPathBytes := ipPathBucket.Get(ipPathKey)
|
||||
totalVisits := uint64(0)
|
||||
if ipPathBytes != nil {
|
||||
|
@ -266,21 +375,35 @@ func incrementTrafficDB(remoteIp, requestPath string, isBot, botsAllowed bool) (
|
|||
secondsSinceLastRequest := time.Since(ip.TimeOfLastRequest).Seconds()
|
||||
ip.TimeOfLastRequest = time.Now()
|
||||
if secondsSinceLastRequest > ip.RollingAvgRequestRate {
|
||||
ip.TimeOfFirstRequest = time.Now()
|
||||
ip.NumAveragedRequests = 1
|
||||
ip.RollingAvgRequestRate = 0
|
||||
ip.RecentRobotsTxtViolations = 0
|
||||
} else {
|
||||
ip.RollingAvgRequestRate -= secondsSinceLastRequest
|
||||
ip.NumAveragedRequests += 1
|
||||
ip.RollingAvgRequestRate = float64(ip.NumAveragedRequests) / (time.Since(ip.TimeOfFirstRequest).Seconds())
|
||||
}
|
||||
logUpdateIn.SleepTimeNeededReturn = 0 //0.01*float64(runtime.NumGoroutine())*float64(ip.NumAveragedRequests)/GLOBAL_RATE_LIMIT - time.Since(ip.TimeOfFirstRequest).Seconds()
|
||||
|
||||
ip.TotalRequests += 1
|
||||
if botsAllowed == false {
|
||||
if logUpdateIn.BotsAllowed == false {
|
||||
ip.RecentRobotsTxtViolations += 1
|
||||
}
|
||||
|
||||
ip.RollingAvgRequestRate += float64(1)
|
||||
if ip.RecentRobotsTxtViolations > int64(config.TarpitAfterRecentRobotsTxtViolations) && ip.RollingAvgRequestRate > config.TarpitAfterRobotsTxtViolationsPerSecond {
|
||||
ip.Blocked = time.Now()
|
||||
}
|
||||
toReturn = ip.Blocked
|
||||
toReturn2 = ip.RecentRobotsTxtViolations
|
||||
rollingAvgMsg := fmt.Sprintf("ip.RollingAvgRequestRate within incrementTrafficDB: %v", ip.RollingAvgRequestRate)
|
||||
verbosityLevelLogPrint(rollingAvgMsg, 5)
|
||||
robotsViolationsMsg := fmt.Sprintf("ip.RecentRobotsTxtViolations within incrementTrafficDB: %v", ip.RecentRobotsTxtViolations)
|
||||
verbosityLevelLogPrint(robotsViolationsMsg, 5)
|
||||
logUpdateIn.BlockedReturn = ip.Blocked
|
||||
logUpdateIn.RecentRobotsViolationsReturn = ip.RecentRobotsTxtViolations
|
||||
if logUpdateIn.AllowSignal {
|
||||
ip.AllowEverywhere = true
|
||||
}
|
||||
logUpdateIn.AllowedEverywhereReturn = ip.AllowEverywhere
|
||||
|
||||
buffer := make([]byte, 8)
|
||||
binary.LittleEndian.PutUint64(buffer, totalVisits)
|
||||
|
@ -296,15 +419,59 @@ func incrementTrafficDB(remoteIp, requestPath string, isBot, botsAllowed bool) (
|
|||
}
|
||||
|
||||
return ipPathBucket.Put(ipPathKey, buffer)
|
||||
})
|
||||
}
|
||||
|
||||
func incrementTrafficDB(remoteIp, requestPath string, isBot, botsAllowed bool) (blockedReturn time.Time, recentRobotsViolationsReturn int64, sleepTimeNeededReturn float64, allowedEverywhereReturn bool) {
|
||||
|
||||
recentRobotsViolationsReturn = int64(0)
|
||||
preErrMsg := fmt.Sprintf("incrementTrafficDB called with params (%v, %v, %v, %v)", remoteIp, requestPath, isBot, botsAllowed)
|
||||
verbosityLevelLogPrint(preErrMsg, 5)
|
||||
|
||||
select {
|
||||
case ipSignaledAllow := <-AllowIPChan:
|
||||
verbosityLevelLogPrint("Consumed AllowIPChan value", 5)
|
||||
allowedLogUpdate := LogUpdate{
|
||||
ForIP: ipSignaledAllow,
|
||||
AllowSignal: true,
|
||||
RequestPath: "/",
|
||||
BotsAllowed: true,
|
||||
}
|
||||
allowedLogMsg := fmt.Sprintf("allowedLogUpdate in ipSignaledAllow case, before update: %v", allowedLogUpdate)
|
||||
verbosityLevelLogPrint(allowedLogMsg, 5)
|
||||
err := db.Update(allowedLogUpdate.UpdateTheLog)
|
||||
if err != nil {
|
||||
log.Printf("boltdb error: %v\n", err)
|
||||
}
|
||||
postUpdateMsg := fmt.Sprintf("allowedLogUpdate in ipSignaledAllow case, after update: %v", allowedLogUpdate)
|
||||
verbosityLevelLogPrint(postUpdateMsg, 5)
|
||||
default:
|
||||
verbosityLevelLogPrint("Default case: no AllowIPChan value to consume", 5)
|
||||
|
||||
return toReturn, toReturn2
|
||||
}
|
||||
|
||||
thisLogUpdate := LogUpdate{
|
||||
ForIP: remoteIp,
|
||||
AllowSignal: false,
|
||||
RequestPath: requestPath,
|
||||
BotsAllowed: botsAllowed,
|
||||
}
|
||||
err := db.Update(thisLogUpdate.UpdateTheLog)
|
||||
if err != nil {
|
||||
log.Printf("boltdb error: %v\n", err)
|
||||
} else {
|
||||
|
||||
sleepTimeNeededReturn = thisLogUpdate.SleepTimeNeededReturn
|
||||
blockedReturn = thisLogUpdate.BlockedReturn
|
||||
recentRobotsViolationsReturn = thisLogUpdate.RecentRobotsViolationsReturn
|
||||
allowedEverywhereReturn = thisLogUpdate.AllowedEverywhereReturn
|
||||
}
|
||||
incrementTrafficReturnDebugMsg := fmt.Sprintf("allowedEveryWhereReturn just before returning incrementTrafficDB: %v", allowedEverywhereReturn)
|
||||
verbosityLevelLogPrint(incrementTrafficReturnDebugMsg, 5)
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
func getRobotsDotTxt() []*regexp.Regexp {
|
||||
func getRobotsDotTxt() (robotsDisallowed []*regexp.Regexp) {
|
||||
robotsTxtResponse, err := http.Get(fmt.Sprintf("http://%s:%d/robots.txt", config.UpstreamHostname, config.UpstreamPort))
|
||||
if err != nil {
|
||||
log.Fatalf(
|
||||
|
@ -328,7 +495,7 @@ func getRobotsDotTxt() []*regexp.Regexp {
|
|||
txtStr := string(txtBytes)
|
||||
txtLines := strings.Split(txtStr, "\n")
|
||||
|
||||
robotsTxtDisallow := []*regexp.Regexp{}
|
||||
robotsDisallowed = []*regexp.Regexp{}
|
||||
for _, line := range txtLines {
|
||||
line = strings.ToLower(line)
|
||||
if strings.HasPrefix(line, "disallow:") {
|
||||
|
@ -337,17 +504,20 @@ func getRobotsDotTxt() []*regexp.Regexp {
|
|||
expression = strings.ReplaceAll(expression, "*", ".*")
|
||||
expression = "^" + expression + ".*"
|
||||
|
||||
log.Printf("robotsTxtDisallow: %s\n", expression)
|
||||
robotsTxtDisallow = append(robotsTxtDisallow, regexp.MustCompile(expression))
|
||||
log.Printf("robotsDisallowed: %s\n", expression)
|
||||
robotsDisallowed = append(robotsDisallowed, regexp.MustCompile(expression))
|
||||
}
|
||||
}
|
||||
return robotsTxtDisallow
|
||||
return
|
||||
}
|
||||
|
||||
// getRemoteIP returns the address used to identify the client of request.
//
// A non-empty X-Forwarded-For header is returned verbatim. Otherwise the
// trailing ":port" is removed from request.RemoteAddr. If RemoteAddr contains
// no colon at all it is returned unchanged instead of panicking.
//
// NOTE(review): for an IPv6 RemoteAddr such as "[2001:db8::1]:443" the result
// keeps the square brackets; confirm that callers doing prefix matching expect
// that form.
func getRemoteIP(request *http.Request) string {
	if forwarded := request.Header.Get("X-Forwarded-For"); forwarded != "" {
		return forwarded
	}

	remoteAddr := request.RemoteAddr
	// LastIndex is needed rather than strings.Split so the colons inside an
	// IPv6 address are not mistaken for the port separator.
	portColonInd := strings.LastIndex(remoteAddr, ":")
	if portColonInd < 0 {
		// No port separator present; slicing with -1 would panic.
		return remoteAddr
	}
	return remoteAddr[:portColonInd]
}
|
||||
|
@ -356,6 +526,7 @@ func myIsBot(remoteIP string, userAgent string) isbot.Result {
|
|||
|
||||
for i, prefix := range config.BlockedSourceIpPrefixes {
|
||||
if strings.HasPrefix(remoteIP, prefix.Prefix) {
|
||||
verbosityLevelLogPrint("Judged to be bot in myIsBot based on BlockedSourceIpPrefixes", 5)
|
||||
return isbot.Result(MyBlockedSourceIpPrefixIsBotReasonOffset + i)
|
||||
}
|
||||
}
|
||||
|
@ -363,6 +534,7 @@ func myIsBot(remoteIP string, userAgent string) isbot.Result {
|
|||
|
||||
for _, r := range BlockedUserAgentRegexes {
|
||||
if r.MatchString(userAgent) {
|
||||
verbosityLevelLogPrint("Judged to be bot based in myIsBot based on BlockedUserAgentRegexes", 5)
|
||||
return isbot.BotClientLibrary
|
||||
}
|
||||
}
|
||||
|
@ -371,9 +543,11 @@ func myIsBot(remoteIP string, userAgent string) isbot.Result {
|
|||
|
||||
func getIsBotReason(code isbot.Result) string {
|
||||
|
||||
thisCodeAsUint8 := uint8(code)
|
||||
|
||||
myReasonMap := map[uint8]string{
|
||||
0: "Known to not be a bot",
|
||||
1: "None of the rules matches, so probably not a bot",
|
||||
1: "None of the rules matches, so we assume non-bot",
|
||||
2: "Prefetch algorithm",
|
||||
3: "User-Agent appeared to contain a URL",
|
||||
4: "Known client library",
|
||||
|
@ -395,5 +569,22 @@ func getIsBotReason(code isbot.Result) string {
|
|||
myReasonMap[uint8(MyBlockedSourceIpPrefixIsBotReasonOffset+i)] = prefix.ShortDesc
|
||||
}
|
||||
|
||||
return myReasonMap[uint8(code)]
|
||||
verbosityLevelLogPrint(fmt.Sprintf("reason fount by getIsBotReason: %v", myReasonMap[thisCodeAsUint8]), 5)
|
||||
return myReasonMap[thisCodeAsUint8]
|
||||
}
|
||||
|
||||
// A couple of helper functions, mainly for debugging
|
||||
func checkErrWarn(err error, msg string, verbosityLevel int) {
|
||||
if err != nil {
|
||||
if verbosityLevel <= GLOBAL_VERBOSITY {
|
||||
log.Printf("%s: %v\n", msg, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func verbosityLevelLogPrint(msg string, verbosityLevel int) {
|
||||
if verbosityLevel <= GLOBAL_VERBOSITY {
|
||||
log.Printf("%s\n", msg)
|
||||
}
|
||||
|
||||
}
|
||||
|
|
43
mainDiagram.mermaid
Normal file
43
mainDiagram.mermaid
Normal file
|
@ -0,0 +1,43 @@
|
|||
stateDiagram-v2
|
||||
direction TB
|
||||
|
||||
accTitle: crawler-blocker Diagram
|
||||
accDescr: Diagram of main.go
|
||||
|
||||
|
||||
classDef systemShell fill:#00f1,font-weight:bold,stroke-width:1px,stroke:black
|
||||
classDef subroutine fill: #dff0fe,stroke-width:1px,stroke:black
|
||||
classDef nixShell fill:#00f1,font-weight:bold,stroke-width:1px,stroke:black
|
||||
classDef terminal fill:black,color:#d7e0ff,font-weight:bold,stroke-width:1px,stroke:black
|
||||
|
||||
class nixShellEnvironment nixShell
|
||||
class setConfigSubroutine, nixShellExitSubroutine subroutine
|
||||
class mainFunction systemShell
|
||||
class preEntryState, setConfigSubroutine, coswayCommand, postExitState terminal
|
||||
|
||||
preEntryState --> mainFunction
|
||||
setConfigSubroutine --> dbStatSubroutine
|
||||
callToConfigSelf --> handleFuncCall
|
||||
mainFunction --> postExitState
|
||||
|
||||
state "Define types. For convenience and clarity." as preEntryState
|
||||
state "Define global variables and constants. These make it easier to pass data between some of the other functions below." as preEntryState
|
||||
state "entrypoint to main" as preEntryState
|
||||
state "main exits and control is returned to the calling environment" as postExitState
|
||||
state "main" as mainFunction {
|
||||
|
||||
state "configureSelf() is called" as callToConfigSelf {
|
||||
|
||||
state "Configuration Subroutine" as setConfigSubroutine
|
||||
state "Import config from file, panic if error occurs" as setConfigSubroutine
|
||||
state "Check that config parameters are within acceptable values. Exit if not." as setConfigSubroutine
|
||||
|
||||
state "DataBase Stat and Initialization Subroutine" as dbStatSubroutine
|
||||
state "stat ./data directory. Create it if it doesn't exist." as dbStatSubroutine
|
||||
}
|
||||
state "call to http.HandlerFunc" as handleFuncCall
|
||||
state "Register whackyHackyHandlerFunc with pattern / to the DefaultServeMux." as handleFuncCall
|
||||
state "The / pattern basically matches everything from the root of the URL path." as handleFuncCall
|
||||
state "So here, all incoming requests should be handled by this function." as handleFuncCall
|
||||
|
||||
}
|
BIN
pkgnamezzz
Executable file
BIN
pkgnamezzz
Executable file
Binary file not shown.
214
pkgnamezzz_test.go
Normal file
214
pkgnamezzz_test.go
Normal file
|
@ -0,0 +1,214 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
"net/http"
|
||||
"testing"
|
||||
"time"
|
||||
)
|
||||
|
||||
// EditableUserAgentTransport pairs a User-Agent string with the
// http.RoundTripper that actually performs requests, so tests can choose the
// User-Agent a client sends.
type EditableUserAgentTransport struct {
	// CustomUserAgent is the User-Agent header value to send.
	CustomUserAgent string
	// UnderlyingTransport carries out the round trip.
	UnderlyingTransport http.RoundTripper
}
|
||||
|
||||
func NewEditableUserAgentTransport(desiredUserAgent string) EditableUserAgentTransport {
|
||||
transportOut := EditableUserAgentTransport{
|
||||
CustomUserAgent: desiredUserAgent,
|
||||
UnderlyingTransport: http.DefaultTransport,
|
||||
}
|
||||
return transportOut
|
||||
}
|
||||
|
||||
func (t *EditableUserAgentTransport) RoundTrip(request *http.Request) (*http.Response, error) {
|
||||
verbosityLevelLogPrint(fmt.Sprintf("roundtripping %s...", request.URL), 5)
|
||||
request.Header["User-Agent"] = []string{t.CustomUserAgent}
|
||||
response, err := t.UnderlyingTransport.RoundTrip(request)
|
||||
if err != nil {
|
||||
log.Printf("contacting %s (%s) failed: %s", request.RequestURI, request.URL.String(), err)
|
||||
return nil, err
|
||||
}
|
||||
return response, nil
|
||||
}
|
||||
|
||||
/*
|
||||
func TestGetRobotsDotTxt(t *testing.T) {
|
||||
fmt.Println("Running test TestGetRobotsDotTxt")
|
||||
//It's assumed configureSelf() will take longer than setting up
|
||||
//the server in runSimpleServer(), so no additional logic has been
|
||||
//added to ensure the server is running before attempting to use it
|
||||
//Should be fine, as it errs on the side of failure anyway.
|
||||
go runSimpleServer(":3000", "TestGetRobotsDotTxt")
|
||||
configureSelf()
|
||||
|
||||
thisRobots := getRobotsDotTxt()
|
||||
robotsMsg := fmt.Sprintf("robots.txt: %v\n", thisRobots)
|
||||
log.Printf(robotsMsg)
|
||||
thisShouldMatch := "http://localhost:3002/respond200"
|
||||
thisShouldNotMatch := "https://localhost:3002/respond303"
|
||||
|
||||
doesActuallyMatch := false
|
||||
shouldNotMatchButDoes := false
|
||||
for _, r := range thisRobots {
|
||||
if r.MatchString(thisShouldMatch) {
|
||||
doesActuallyMatch = true
|
||||
}
|
||||
if r.MatchString(thisShouldNotMatch) {
|
||||
shouldNotMatchButDoes = true
|
||||
}
|
||||
}
|
||||
if !doesActuallyMatch {
|
||||
t.Errorf("Pattern %v should match robots regexp but does not.", thisShouldMatch)
|
||||
}
|
||||
if shouldNotMatchButDoes {
|
||||
t.Errorf("Pattern %v should NOT match robots regexp, but it does.", thisShouldNotMatch)
|
||||
}
|
||||
|
||||
}
|
||||
*/
|
||||
|
||||
// ReqBoxForTest bundles everything needed for one scripted GET request in a
// test: where to send it, which status is expected, which client to use, and
// how long to pause afterwards.
type ReqBoxForTest struct {
	// TestURL is the URL fetched with a GET request.
	TestURL string
	// StatusWant is the HTTP status code the response must carry.
	StatusWant int
	// CustomClient is the client used to issue the request.
	CustomClient http.Client
	// SleepTime is how long to sleep after the request completes.
	SleepTime time.Duration
}
|
||||
|
||||
func (rb ReqBoxForTest) DoRequestStuff(t *testing.T) (measuredFetchTime time.Duration) {
|
||||
|
||||
testURL := rb.TestURL
|
||||
testURLMsg := fmt.Sprintf("testURL: %v", testURL)
|
||||
verbosityLevelLogPrint(testURLMsg, 5)
|
||||
thisClient := rb.CustomClient
|
||||
timeBeforeGet := time.Now()
|
||||
thisResponse, err := thisClient.Get(testURL)
|
||||
measuredFetchTime = time.Since(timeBeforeGet)
|
||||
statusWant := rb.StatusWant
|
||||
statusGot := thisResponse.StatusCode
|
||||
if err != nil {
|
||||
t.Errorf("Error getting %s: %v", testURL, err)
|
||||
} else if statusWant != statusGot {
|
||||
t.Errorf("Test should fail: Should have gotten status %v but got %v!", statusWant, statusGot)
|
||||
}
|
||||
fmt.Printf("thisResponse: %v\n", thisResponse)
|
||||
time.Sleep(rb.SleepTime)
|
||||
return
|
||||
}
|
||||
|
||||
func TestMain(t *testing.T) {
|
||||
GLOBAL_VERBOSITY = 5
|
||||
fmt.Println("Running test TestMain")
|
||||
go runSimpleServer(":3000", "TestMain")
|
||||
|
||||
commonRealUserAgent := "Mozilla/5.0 (X11; Linux x86_64; rv:122.0) Gecko/20100101 Firefox/122.0"
|
||||
thisTransport := NewEditableUserAgentTransport(commonRealUserAgent)
|
||||
thisClient := http.Client{Transport: &thisTransport}
|
||||
go main()
|
||||
|
||||
//Sleep while main sets up the reverse proxy
|
||||
//Inelegant, but errs on the side of failure
|
||||
//so its okay
|
||||
time.Sleep(1 * time.Second)
|
||||
|
||||
testCaseNormalUserLike := ReqBoxForTest{
|
||||
TestURL: "http://localhost:3002/respond200",
|
||||
StatusWant: 200,
|
||||
CustomClient: thisClient,
|
||||
SleepTime: 0 * time.Second,
|
||||
}
|
||||
|
||||
testCaseNormalUserLike.DoRequestStuff(t)
|
||||
|
||||
testCaseMaliciousUser := ReqBoxForTest{
|
||||
TestURL: "http://localhost:3002/respond200",
|
||||
StatusWant: 200,
|
||||
CustomClient: thisClient,
|
||||
SleepTime: 0 * time.Second,
|
||||
}
|
||||
|
||||
numRequestsDone := 0
|
||||
testDuration := time.Duration(0)
|
||||
for i := 0; i < 20; i++ {
|
||||
testDuration += testCaseMaliciousUser.DoRequestStuff(t)
|
||||
durationMsg := fmt.Sprintf("test iteration %v had duration %v", i, testDuration)
|
||||
verbosityLevelLogPrint(durationMsg, 5)
|
||||
numRequestsDone++
|
||||
}
|
||||
|
||||
testRate := float64(numRequestsDone) / testDuration.Seconds()
|
||||
totalDurationMsg := fmt.Sprintf("Rate limit test completed %v test requests in %v seconds", numRequestsDone, testDuration.Seconds())
|
||||
verbosityLevelLogPrint(totalDurationMsg, 5)
|
||||
if testRate > GLOBAL_RATE_LIMIT+1 {
|
||||
t.Errorf("MaliciousUser test should fail: reverse proxy failed to enforce global rate limit of %v. Tested average request rate was: %v", GLOBAL_RATE_LIMIT, testRate)
|
||||
}
|
||||
|
||||
sillyFakeUserAgent := "I am 3vil and undetectable 8D"
|
||||
thisTransport = NewEditableUserAgentTransport(sillyFakeUserAgent)
|
||||
thisClient = http.Client{Transport: &thisTransport}
|
||||
|
||||
maxNumRobotsViolations := config.TarpitAfterRecentRobotsTxtViolations
|
||||
desiredRateLimit := config.TarpitAfterRobotsTxtViolationsPerSecond
|
||||
neededSleepTime := 1000.0 / (desiredRateLimit + 1.0)
|
||||
rateLimitMsg := fmt.Sprintf("desiredRateLimit: %v, so sleep time should be %v", desiredRateLimit, neededSleepTime)
|
||||
verbosityLevelLogPrint(rateLimitMsg, 5)
|
||||
|
||||
testCaseDumbBotLike := ReqBoxForTest{
|
||||
TestURL: "http://localhost:3002/respond200",
|
||||
StatusWant: 403,
|
||||
CustomClient: thisClient,
|
||||
SleepTime: time.Duration(neededSleepTime) * time.Millisecond,
|
||||
}
|
||||
for reqIndex := 0; reqIndex < maxNumRobotsViolations-21; reqIndex++ {
|
||||
testCaseDumbBotLike.DoRequestStuff(t)
|
||||
}
|
||||
|
||||
testCaseDumbBotPassesThreshold := ReqBoxForTest{
|
||||
TestURL: "http://localhost:3002/respond200",
|
||||
StatusWant: 200,
|
||||
CustomClient: thisClient,
|
||||
SleepTime: time.Duration(neededSleepTime) * time.Millisecond,
|
||||
}
|
||||
testCaseDumbBotPassesThreshold.DoRequestStuff(t)
|
||||
|
||||
testCasePushRateLimit := ReqBoxForTest{
|
||||
TestURL: "http://localhost:3002/respond200robotsallowed",
|
||||
StatusWant: 200,
|
||||
CustomClient: thisClient,
|
||||
SleepTime: 0 * time.Second,
|
||||
}
|
||||
|
||||
numRequestsDone = 0
|
||||
testDuration = time.Duration(0)
|
||||
for i := 0; i < 20; i++ {
|
||||
testDuration += testCasePushRateLimit.DoRequestStuff(t)
|
||||
durationMsg := fmt.Sprintf("test iteration %v had duration %v", i, testDuration)
|
||||
verbosityLevelLogPrint(durationMsg, 5)
|
||||
numRequestsDone++
|
||||
}
|
||||
|
||||
testRate = float64(numRequestsDone) / testDuration.Seconds()
|
||||
totalDurationMsg = fmt.Sprintf("Rate limit test completed %v test requests in %v seconds", numRequestsDone, testDuration.Seconds())
|
||||
verbosityLevelLogPrint(totalDurationMsg, 5)
|
||||
if testRate > GLOBAL_RATE_LIMIT+1 {
|
||||
t.Errorf("Test should fail: reverse proxy failed to enforce global rate limit of %v. Tested average request rate was: %v", GLOBAL_RATE_LIMIT, testRate)
|
||||
}
|
||||
|
||||
testCaseUserLoggingIn := ReqBoxForTest{
|
||||
TestURL: "http://localhost:3002/respond303",
|
||||
StatusWant: 303,
|
||||
CustomClient: thisClient,
|
||||
SleepTime: 0 * time.Second,
|
||||
}
|
||||
testCaseUserLoggingIn.DoRequestStuff(t)
|
||||
|
||||
testCaseUserAfterLoggedIn := ReqBoxForTest{
|
||||
TestURL: "http://localhost:3002/respond200",
|
||||
StatusWant: 200,
|
||||
CustomClient: thisClient,
|
||||
SleepTime: 0 * time.Second,
|
||||
}
|
||||
|
||||
testCaseUserAfterLoggedIn.DoRequestStuff(t)
|
||||
|
||||
}
|
16
rolling_avg_calculation.txt
Normal file
16
rolling_avg_calculation.txt
Normal file
|
@ -0,0 +1,16 @@
|
|||
requestRate = requests/second
|
||||
avgRequestRate = numberOfRequests/TotalTimeLoggingRequests
|
||||
|
||||
to be within limit, need:
|
||||
avgRequestRate <= globalLimit
|
||||
or:
|
||||
numRequests/TotalTime <= globalLimit
|
||||
numRequests/globalLimit <= TotalTime
|
||||
|
||||
We can control it with sleep time, so
|
||||
|
||||
numRequests/globalLimit <= TotalTime + SleepTime
|
||||
so
|
||||
numRequests/globalLimit - TotalTime <= SleepTime
|
||||
|
||||
We'll go with equal here, but we could also do a randrange with the left hand side as the minimum.
|
70
simpleServer.go
Normal file
70
simpleServer.go
Normal file
|
@ -0,0 +1,70 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"net/http"
|
||||
)
|
||||
|
||||
// rootHandler answers every path that has no more specific handler with
// status 200 and a fixed text body.
func rootHandler(w http.ResponseWriter, r *http.Request) {
	data := "Any page other than /respond303 or /respond200 should end up here"
	// All headers must be set before WriteHeader; later changes have no effect.
	w.Header().Set("Content-Type", "text/html")
	w.Header().Set("Content-Length", fmt.Sprint(len(data)))
	w.WriteHeader(http.StatusOK)
	fmt.Fprint(w, data)
}
|
||||
|
||||
// robotsHandler serves the contents of testRobots.txt (read from the current
// working directory on every call) with status 200.
//
// If the file cannot be read, the error is logged and a 500 response is sent;
// the file is read before any header is written so that a failure is not
// reported with a 200 status.
func robotsHandler(w http.ResponseWriter, r *http.Request) {
	fmt.Printf("robotsHandler called\n")

	data, err := ioutil.ReadFile("testRobots.txt")
	if err != nil {
		log.Printf("reading testRobots.txt: %v", err)
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}

	// All headers must be set before WriteHeader; later changes have no effect.
	w.Header().Set("Content-Type", "text/html")
	w.Header().Set("Content-Length", fmt.Sprint(len(data)))
	w.WriteHeader(http.StatusOK)
	w.Write(data)
}
|
||||
|
||||
// respond303Handler always answers with status 303 and a fixed text body.
// No Location header is set.
func respond303Handler(w http.ResponseWriter, r *http.Request) {
	data := "You should get a 303 status!"
	// All headers must be set before WriteHeader; later changes have no effect.
	w.Header().Set("Content-Type", "text/html")
	w.Header().Set("Content-Length", fmt.Sprint(len(data)))
	w.WriteHeader(http.StatusSeeOther)
	fmt.Fprint(w, data)
}
|
||||
|
||||
// respond200Handler always answers with status 200 and a fixed text body.
func respond200Handler(w http.ResponseWriter, r *http.Request) {
	data := "You should get a 200 status!"
	// All headers must be set before WriteHeader; later changes have no effect.
	w.Header().Set("Content-Type", "text/html")
	w.Header().Set("Content-Length", fmt.Sprint(len(data)))
	w.WriteHeader(http.StatusOK)
	fmt.Fprint(w, data)
}
|
||||
|
||||
// customHandlerToAvoidDefaultServeMux is an http.Handler backed by its own
// ServeMux, so routes registered for the stub server do not land on
// http.DefaultServeMux.
type customHandlerToAvoidDefaultServeMux struct {
	// NondefaultServeMux holds the routes this handler dispatches to.
	NondefaultServeMux *http.ServeMux
}
|
||||
|
||||
func (customHandler customHandlerToAvoidDefaultServeMux) ServeHTTP(rw http.ResponseWriter, req *http.Request) {
|
||||
verbosityLevelLogPrint(fmt.Sprintf("call to customHandlerToAvoidDefaultServeMux with request URL: %v", req.URL), 5)
|
||||
customHandler.NondefaultServeMux.ServeHTTP(rw, req)
|
||||
}
|
||||
|
||||
func runSimpleServer(portIn string, callingFunc string) {
|
||||
GLOBAL_VERBOSITY = 5
|
||||
//Need to define a new ServeMux to disambiguate from the main() server
|
||||
testServeMux := http.NewServeMux()
|
||||
testHandler := customHandlerToAvoidDefaultServeMux{NondefaultServeMux: testServeMux}
|
||||
|
||||
testHandler.NondefaultServeMux.HandleFunc("/", rootHandler)
|
||||
testHandler.NondefaultServeMux.HandleFunc("/respond303", respond303Handler)
|
||||
testHandler.NondefaultServeMux.HandleFunc("/respond200", respond200Handler)
|
||||
testHandler.NondefaultServeMux.HandleFunc("/respond200RobotsAllowed", respond200Handler)
|
||||
testHandler.NondefaultServeMux.HandleFunc("/robots.txt", robotsHandler)
|
||||
verbosityLevelLogPrint(fmt.Sprintf("runSimpleServer called by %v\n", callingFunc), 5)
|
||||
verbosityLevelLogPrint(fmt.Sprintf("About to ListenAndServe with params ( %v , %v )", portIn, testHandler), 5)
|
||||
log.Fatal(http.ListenAndServe(portIn, testHandler))
|
||||
|
||||
}
|
4
testRobots.txt
Normal file
4
testRobots.txt
Normal file
|
@ -0,0 +1,4 @@
|
|||
disallow: https://localhost:8080
|
||||
disallow: http://localhost:3002/respond200
|
||||
disallow: http://localhost:3000/respond200
|
||||
disallow: /respond200
|
2899
testlog.txt
Normal file
2899
testlog.txt
Normal file
File diff suppressed because it is too large
Load diff
31
whackyHackyHandlerFunc.mermaid
Normal file
31
whackyHackyHandlerFunc.mermaid
Normal file
|
@ -0,0 +1,31 @@
|
|||
stateDiagram-v2
|
||||
direction TB
|
||||
|
||||
accTitle: crawler-blocker Diagram
|
||||
accDescr: Diagram of main.go
|
||||
|
||||
|
||||
classDef systemShell fill:#00f1,font-weight:bold,stroke-width:1px,stroke:black
|
||||
classDef subroutine fill: #dff0fe,stroke-width:1px,stroke:black
|
||||
classDef nixShell fill:#00f1,font-weight:bold,stroke-width:1px,stroke:black
|
||||
classDef terminal fill:black,color:#d7e0ff,font-weight:bold,stroke-width:1px,stroke:black
|
||||
|
||||
class nixShellEnvironment nixShell
|
||||
class setConfigSubroutine, nixShellExitSubroutine subroutine
|
||||
class mainFunction systemShell
|
||||
class preEntryState, setConfigSubroutine, coswayCommand, postExitState terminal
|
||||
|
||||
preEntryState --> mainFunction
|
||||
setConfigSubroutine --> setConfigSubroutine
|
||||
|
||||
state "DefaultServeMux (I think) listens on ports 443 and 80, or something" as preEntryState
|
||||
state "HTTP request on 443 triggers entrypoint to whackyHackyHandlerFunc" as preEntryState
|
||||
|
||||
state "whackyHackyHandlerFunc" as whhf {
|
||||
state "set reverseProxyInstance, of type ModifiedReverseProxy." as instantiateReverseProxy
|
||||
state "This type composes ('inherits', though not in the technical sense) the ReverseProxy type from the stdlib." as instantiateReverseProxy
|
||||
|
||||
|
||||
}
|
||||
|
||||
state "return from whackyHackyHandlerFunc, probably killing a goroutine that was spawned by DefaultServeMux." as return
|
Loading…
Reference in a new issue