feat: 部署初版测试

2026-03-02 21:25:21 +08:00
parent db3abb3174
commit 8cf6cb944b
97 changed files with 10250 additions and 209 deletions
@@ -0,0 +1,133 @@
+package scraper
+
+import (
+	"crypto/tls"
+	"errors"
+	"fmt"
+	"io"
+	"net/http"
+	"sync"
+	"time"
+
+	"github.com/sony/gobreaker/v2"
+	"golang.org/x/exp/rand"
+)
+
+// ErrCircuitOpen is returned by Fetch when the circuit breaker rejects the
+// call because it is in the open state.
+var ErrCircuitOpen = errors.New("scraper circuit breaker is open")
+
+// ErrRateLimited is returned by Fetch when the target answers with HTTP 429.
+var ErrRateLimited = errors.New("scraper hit rate limit (429)")
+
+// ErrUnavailable is returned by Fetch when the target answers with HTTP 503.
+var ErrUnavailable = errors.New("scraper target unavailable (503)")
+
+// ScraperClient fetches pages over HTTP on behalf of the scraper. Every
+// request issued through Fetch is executed inside a circuit breaker.
+type ScraperClient struct {
+	// http is the shared client used for all outgoing requests.
+	http *http.Client
+	// breaker wraps each Fetch call and yields the response body on success.
+	breaker *gobreaker.CircuitBreaker[[]byte]
+
+	// mu serializes access to rng, which the methods of this type only
+	// touch while holding the lock.
+	mu  sync.Mutex
+	rng *rand.Rand
+}
+
+// userAgents is the pool of browser User-Agent strings from which one entry
+// is picked at random for each outgoing request.
+var userAgents = []string{
+	"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
+	"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
+	"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0",
+	"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+	"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/112.0",
+}
+
+func NewScraperClient() *ScraperClient {
+	// Custom transport to mask TLS fingerprints somewhat and set timeouts
+	tr := &http.Transport{
+		TLSClientConfig:   &tls.Config{MinVersion: tls.VersionTLS12},
+		ForceAttemptHTTP2: true,
+		MaxIdleConns:      100,
+		IdleConnTimeout:   90 * time.Second,
+	}
+
+	client := &http.Client{
+		Transport: tr,
+		Timeout:   15 * time.Second,
+	}
+
+	// Circuit Breaker: Trip on 5 consecutive failures, wait 60 seconds (Exponential behavior is often custom, but standard half-open helps)
+	st := gobreaker.Settings{
+		Name:        "NitterScraperCB",
+		MaxRequests: 1,
+		Interval:    0,
+		Timeout:     60 * time.Second, // Wait 60s before allowing retry if Open
+		ReadyToTrip: func(counts gobreaker.Counts) bool {
+			return counts.ConsecutiveFailures >= 3
+		},
+	}
+
+	return &ScraperClient{
+		http:    client,
+		breaker: gobreaker.NewCircuitBreaker[[]byte](st),
+		rng:     rand.New(rand.NewSource(uint64(time.Now().UnixNano()))),
+	}
+}
+
+// getRandomUserAgent returns one entry of userAgents chosen with the
+// client's random source. The lock is held while the source is used.
+func (c *ScraperClient) getRandomUserAgent() string {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	idx := c.rng.Intn(len(userAgents))
+	return userAgents[idx]
+}
+
+// JitterDelay blocks the calling goroutine for a random duration of at least
+// minMs and less than maxMs milliseconds.
+//
+// When maxMs is not greater than minMs there is no range to draw from, so the
+// call sleeps for exactly minMs milliseconds instead of panicking. A
+// non-positive delay returns immediately.
+func (c *ScraperClient) JitterDelay(minMs, maxMs int) {
+	delay := minMs
+
+	// Intn panics for a non-positive argument, so only draw a random offset
+	// when the range is non-empty. The lock is held just long enough to use
+	// the shared random source, never across the sleep.
+	if span := maxMs - minMs; span > 0 {
+		c.mu.Lock()
+		delay += c.rng.Intn(span)
+		c.mu.Unlock()
+	}
+
+	time.Sleep(time.Duration(delay) * time.Millisecond)
+}
+
+// Fetch returns the raw body byte stream while handling Circuit Breaking and Status checking.
+func (c *ScraperClient) Fetch(url string) ([]byte, error) {
+	respBody, err := c.breaker.Execute(func() ([]byte, error) {
+		req, err := http.NewRequest("GET", url, nil)
+		if err != nil {
+			return nil, err
+		}
+
+		req.Header.Set("User-Agent", c.getRandomUserAgent())
+		req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8")
+		req.Header.Set("Accept-Language", "en-US,en;q=0.5")
+
+		resp, err := c.http.Do(req)
+		if err != nil {
+			return nil, err
+		}
+		defer resp.Body.Close()
+
+		if resp.StatusCode == http.StatusTooManyRequests {
+			return nil, ErrRateLimited
+		}
+		if resp.StatusCode == http.StatusServiceUnavailable {
+			return nil, ErrUnavailable
+		}
+		if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+			return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
+		}
+
+		// Read to memory in Execute block so if it fails, circuit tracks it. ReadAll is fine for HTML scrapes.
+		var data []byte
+		buf := make([]byte, 1024)
+		for {
+			n, err := resp.Body.Read(buf)
+			if n > 0 {
+				data = append(data, buf[:n]...)
+			}
+			if err != nil {
+				break
+			}
+		}
+
+		return data, nil
+	})
+
+	if err != nil {
+		if err == gobreaker.ErrOpenState {
+			return nil, ErrCircuitOpen
+		}
+		return nil, err
+	}
+
+	return respBody, nil
+}
@@ -0,0 +1,146 @@
+package scraper
+
+import (
+	"bytes"
+	"fmt"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/PuerkitoBio/goquery"
+)
+
+// ParsedTweet is one tweet extracted from a timeline page by ParseTimeline.
+type ParsedTweet struct {
+	// ID is the last path segment of the tweet link; Author and Handle are
+	// the display name and username texts; Content is the tweet body text.
+	ID, Author, Handle, Content string
+
+	// Engagement counters read from the tweet's stat row. A counter that is
+	// not found on the page stays 0.
+	Likes, Retweets, Replies int
+
+	// CreatedAt is the timestamp taken from the tweet's date link.
+	CreatedAt time.Time
+}
+
+// ParseTimeline extracts the tweets contained in a Nitter timeline HTML page.
+//
+// Each ".timeline-item" element yields at most one ParsedTweet. Items marked
+// "show-more" or "more-replies" are skipped, as are items for which no ID or
+// no content could be extracted. The returned slice is nil when the page has
+// no usable tweets. An error is returned only when the HTML cannot be loaded.
+func ParseTimeline(htmlData []byte) ([]ParsedTweet, error) {
+	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(htmlData))
+	if err != nil {
+		return nil, fmt.Errorf("failed to load HTML document: %w", err)
+	}
+
+	// Example of the date title: "Feb 28, 2026 · 1:23 PM UTC".
+	const dateLayout = "Jan 2, 2006 · 3:04 PM MST"
+
+	// firstText returns the trimmed text of the first element matching
+	// selector. Text() on a multi-element selection concatenates all matches,
+	// so an item that embeds a second name or body (presumably quoted tweets
+	// - verify against real pages) would otherwise produce merged strings.
+	firstText := func(sel *goquery.Selection, selector string) string {
+		return strings.TrimSpace(sel.Find(selector).First().Text())
+	}
+
+	var tweets []ParsedTweet
+
+	doc.Find(".timeline-item").Each(func(_ int, item *goquery.Selection) {
+		// Only parse actual tweets (not "Show thread" links or "Load more").
+		if item.HasClass("show-more") || item.HasClass("more-replies") {
+			return
+		}
+
+		tweet := ParsedTweet{
+			Author:  firstText(item, ".fullname"),
+			Handle:  firstText(item, ".username"),
+			Content: firstText(item, ".tweet-content"),
+		}
+
+		// The ID is the last path segment of the tweet link. Nitter
+		// sometimes appends "#m" to these links.
+		if href, ok := item.Find("a.tweet-link").Attr("href"); ok {
+			lastSegment := href[strings.LastIndex(href, "/")+1:]
+			tweet.ID = strings.TrimSuffix(lastSegment, "#m")
+		}
+
+		// An item without ID or content is not a usable tweet.
+		if tweet.ID == "" || tweet.Content == "" {
+			return
+		}
+
+		// Date. A missing or empty title leaves CreatedAt at its zero value;
+		// a title that does not match the layout falls back to the current
+		// time.
+		if title, _ := item.Find(".tweet-date a[title]").Attr("title"); title != "" {
+			created, err := time.Parse(dateLayout, title)
+			if err != nil {
+				created = time.Now()
+			}
+			tweet.CreatedAt = created
+		}
+
+		// Stats: the icon's class tells which counter the text belongs to.
+		item.Find(".tweet-stat").Each(func(_ int, stat *goquery.Selection) {
+			class, ok := stat.Find("span.icon-container > span").Attr("class")
+			if !ok {
+				return
+			}
+
+			count := parseStatString(strings.TrimSpace(stat.Text()))
+
+			switch {
+			case strings.Contains(class, "icon-comment"):
+				tweet.Replies = count
+			case strings.Contains(class, "icon-retweet"):
+				tweet.Retweets = count
+			case strings.Contains(class, "icon-heart"):
+				tweet.Likes = count
+			}
+		})
+
+		tweets = append(tweets, tweet)
+	})
+
+	return tweets, nil
+}
+
+// parseStatString converts string representations like "15.4K" to integer 15400
+func parseStatString(s string) int {
+	if s == "" {
+		return 0
+	}
+	s = strings.ReplaceAll(s, ",", "")
+	s = strings.ReplaceAll(s, " ", "")
+	
+	multiplier := 1.0
+	lower := strings.ToLower(s)
+	
+	if strings.HasSuffix(lower, "k") {
+		multiplier = 1000.0
+		s = s[:len(s)-1]
+	} else if strings.HasSuffix(lower, "m") {
+		multiplier = 1000000.0
+		s = s[:len(s)-1]
+	} else if strings.HasSuffix(lower, "b") {
+		multiplier = 1000000000.0
+		s = s[:len(s)-1]
+	}
+
+	val, err := strconv.ParseFloat(s, 64)
+	if err != nil {
+		return 0
+	}
+
+	return int(val * multiplier)
+}