134 lines
3.6 KiB
Go
134 lines
3.6 KiB
Go
package scraper
|
|
|
|
import (
	"crypto/tls"
	"errors"
	"fmt"
	"io"
	"net/http"
	"sync"
	"time"

	"github.com/sony/gobreaker/v2"
	"golang.org/x/exp/rand"
)
|
|
|
|
// Sentinel errors returned by Fetch so callers can distinguish transient
// upstream conditions from hard failures using errors.Is.
var (
	// ErrCircuitOpen is returned when the circuit breaker rejects the call
	// without attempting a request.
	ErrCircuitOpen = errors.New("scraper circuit breaker is open")
	// ErrRateLimited is returned when the target responds with HTTP 429.
	ErrRateLimited = errors.New("scraper hit rate limit (429)")
	// ErrUnavailable is returned when the target responds with HTTP 503.
	ErrUnavailable = errors.New("scraper target unavailable (503)")
)
|
|
|
|
// ScraperClient is an HTTP scraping client that combines a tuned
// http.Client, a circuit breaker, and a mutex-guarded PRNG used for
// user-agent rotation and jittered inter-request delays.
// Construct it with NewScraperClient; the zero value is not usable.
type ScraperClient struct {
	http *http.Client // transport/timeouts configured in NewScraperClient
	breaker *gobreaker.CircuitBreaker[[]byte] // wraps every Fetch; settings in NewScraperClient
	mu sync.Mutex // guards rng: *rand.Rand is not safe for concurrent use
	rng *rand.Rand // time-seeded in NewScraperClient
}
|
|
|
|
// userAgents is a small pool of realistic desktop-browser User-Agent
// strings; getRandomUserAgent picks one uniformly at random per request
// to vary the client fingerprint.
var userAgents = []string{
	"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
	"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
	"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0",
	"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
	"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/112.0",
}
|
|
|
|
func NewScraperClient() *ScraperClient {
|
|
// Custom transport to mask TLS fingerprints somewhat and set timeouts
|
|
tr := &http.Transport{
|
|
TLSClientConfig: &tls.Config{MinVersion: tls.VersionTLS12},
|
|
ForceAttemptHTTP2: true,
|
|
MaxIdleConns: 100,
|
|
IdleConnTimeout: 90 * time.Second,
|
|
}
|
|
|
|
client := &http.Client{
|
|
Transport: tr,
|
|
Timeout: 15 * time.Second,
|
|
}
|
|
|
|
// Circuit Breaker: Trip on 5 consecutive failures, wait 60 seconds (Exponential behavior is often custom, but standard half-open helps)
|
|
st := gobreaker.Settings{
|
|
Name: "NitterScraperCB",
|
|
MaxRequests: 1,
|
|
Interval: 0,
|
|
Timeout: 60 * time.Second, // Wait 60s before allowing retry if Open
|
|
ReadyToTrip: func(counts gobreaker.Counts) bool {
|
|
return counts.ConsecutiveFailures >= 3
|
|
},
|
|
}
|
|
|
|
return &ScraperClient{
|
|
http: client,
|
|
breaker: gobreaker.NewCircuitBreaker[[]byte](st),
|
|
rng: rand.New(rand.NewSource(uint64(time.Now().UnixNano()))),
|
|
}
|
|
}
|
|
|
|
func (c *ScraperClient) getRandomUserAgent() string {
|
|
c.mu.Lock()
|
|
defer c.mu.Unlock()
|
|
return userAgents[c.rng.Intn(len(userAgents))]
|
|
}
|
|
|
|
func (c *ScraperClient) JitterDelay(minMs, maxMs int) {
|
|
c.mu.Lock()
|
|
delay := minMs + c.rng.Intn(maxMs-minMs)
|
|
c.mu.Unlock()
|
|
time.Sleep(time.Duration(delay) * time.Millisecond)
|
|
}
|
|
|
|
// Fetch returns the raw body byte stream while handling Circuit Breaking and Status checking.
|
|
func (c *ScraperClient) Fetch(url string) ([]byte, error) {
|
|
respBody, err := c.breaker.Execute(func() ([]byte, error) {
|
|
req, err := http.NewRequest("GET", url, nil)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
req.Header.Set("User-Agent", c.getRandomUserAgent())
|
|
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8")
|
|
req.Header.Set("Accept-Language", "en-US,en;q=0.5")
|
|
|
|
resp, err := c.http.Do(req)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode == http.StatusTooManyRequests {
|
|
return nil, ErrRateLimited
|
|
}
|
|
if resp.StatusCode == http.StatusServiceUnavailable {
|
|
return nil, ErrUnavailable
|
|
}
|
|
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
|
return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
|
|
}
|
|
|
|
// Read to memory in Execute block so if it fails, circuit tracks it. ReadAll is fine for HTML scrapes.
|
|
var data []byte
|
|
buf := make([]byte, 1024)
|
|
for {
|
|
n, err := resp.Body.Read(buf)
|
|
if n > 0 {
|
|
data = append(data, buf[:n]...)
|
|
}
|
|
if err != nil {
|
|
break
|
|
}
|
|
}
|
|
|
|
return data, nil
|
|
})
|
|
|
|
if err != nil {
|
|
if err == gobreaker.ErrOpenState {
|
|
return nil, ErrCircuitOpen
|
|
}
|
|
return nil, err
|
|
}
|
|
|
|
return respBody, nil
|
|
}
|