// Package scraper provides an HTTP client hardened for polite web scraping:
// rotating User-Agents, jittered inter-request delays, and a circuit breaker
// that backs off when the target rate-limits or fails repeatedly.
package scraper

import (
	"crypto/tls"
	"errors"
	"fmt"
	"io"
	"math/rand/v2"
	"net/http"
	"sync"
	"time"

	"github.com/sony/gobreaker/v2"
)

// Sentinel errors callers can match with errors.Is.
var (
	ErrCircuitOpen = errors.New("scraper circuit breaker is open")
	ErrRateLimited = errors.New("scraper hit rate limit (429)")
	ErrUnavailable = errors.New("scraper target unavailable (503)")
)

// ScraperClient wraps an *http.Client with a circuit breaker and a
// mutex-guarded RNG used for User-Agent rotation and delay jitter.
type ScraperClient struct {
	http    *http.Client
	breaker *gobreaker.CircuitBreaker[[]byte]
	mu      sync.Mutex // guards rng: *rand.Rand is not safe for concurrent use
	rng     *rand.Rand
}

// userAgents is the pool of browser User-Agent strings rotated per request.
var userAgents = []string{
	"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
	"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
	"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0",
	"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
	"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/112.0",
}

// NewScraperClient builds a ScraperClient with sane transport timeouts and a
// circuit breaker that trips after 3 consecutive failures and stays open for
// 60 seconds before allowing a single half-open probe request.
func NewScraperClient() *ScraperClient {
	// Custom transport to pin a TLS floor and bound idle-connection reuse.
	tr := &http.Transport{
		TLSClientConfig:   &tls.Config{MinVersion: tls.VersionTLS12},
		ForceAttemptHTTP2: true,
		MaxIdleConns:      100,
		IdleConnTimeout:   90 * time.Second,
	}
	client := &http.Client{
		Transport: tr,
		Timeout:   15 * time.Second,
	}

	// Circuit breaker: trip on 3 consecutive failures, wait 60s while open,
	// then permit exactly one request in the half-open state.
	st := gobreaker.Settings{
		Name:        "NitterScraperCB",
		MaxRequests: 1,
		Interval:    0,
		Timeout:     60 * time.Second, // wait 60s before allowing a retry when open
		ReadyToTrip: func(counts gobreaker.Counts) bool {
			return counts.ConsecutiveFailures >= 3
		},
	}

	now := uint64(time.Now().UnixNano())
	return &ScraperClient{
		http:    client,
		breaker: gobreaker.NewCircuitBreaker[[]byte](st),
		// math/rand/v2 replaces the frozen golang.org/x/exp/rand package.
		rng: rand.New(rand.NewPCG(now, now>>32)),
	}
}

// getRandomUserAgent returns one User-Agent string chosen uniformly at random
// from the pool. Safe for concurrent use.
func (c *ScraperClient) getRandomUserAgent() string {
	c.mu.Lock()
	defer c.mu.Unlock()
	return userAgents[c.rng.IntN(len(userAgents))]
}

// JitterDelay sleeps for a random duration in [minMs, maxMs) milliseconds to
// avoid a detectable fixed request cadence. If the bounds are reversed they
// are swapped; if they are equal the sleep is exactly minMs (the original
// rng.Intn(0) call would have panicked in that case).
func (c *ScraperClient) JitterDelay(minMs, maxMs int) {
	if maxMs < minMs {
		minMs, maxMs = maxMs, minMs
	}
	delay := minMs
	if span := maxMs - minMs; span > 0 {
		c.mu.Lock()
		delay += c.rng.IntN(span)
		c.mu.Unlock()
	}
	time.Sleep(time.Duration(delay) * time.Millisecond)
}

// Fetch GETs url through the circuit breaker and returns the raw response
// body. 429 and 503 map to ErrRateLimited / ErrUnavailable (and count as
// breaker failures); any breaker-rejected call maps to ErrCircuitOpen.
func (c *ScraperClient) Fetch(url string) ([]byte, error) {
	respBody, err := c.breaker.Execute(func() ([]byte, error) {
		req, err := http.NewRequest(http.MethodGet, url, nil)
		if err != nil {
			return nil, err
		}
		req.Header.Set("User-Agent", c.getRandomUserAgent())
		req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8")
		req.Header.Set("Accept-Language", "en-US,en;q=0.5")

		resp, err := c.http.Do(req)
		if err != nil {
			return nil, err
		}
		defer resp.Body.Close()

		switch {
		case resp.StatusCode == http.StatusTooManyRequests:
			return nil, ErrRateLimited
		case resp.StatusCode == http.StatusServiceUnavailable:
			return nil, ErrUnavailable
		case resp.StatusCode < 200 || resp.StatusCode >= 300:
			return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
		}

		// Read inside Execute so a mid-transfer failure counts against the
		// breaker. io.ReadAll (unlike the previous hand-rolled loop, which
		// broke on ANY error) reports non-EOF read errors instead of
		// silently returning a truncated body.
		data, err := io.ReadAll(resp.Body)
		if err != nil {
			return nil, fmt.Errorf("reading response body: %w", err)
		}
		return data, nil
	})
	if err != nil {
		// Both "open" and "half-open saturated" rejections mean the breaker
		// refused the call; surface them uniformly as ErrCircuitOpen.
		if errors.Is(err, gobreaker.ErrOpenState) || errors.Is(err, gobreaker.ErrTooManyRequests) {
			return nil, ErrCircuitOpen
		}
		return nil, err
	}
	return respBody, nil
}