feat: 部署初版测试
This commit is contained in:
133
server/internal/scraper/client.go
Normal file
133
server/internal/scraper/client.go
Normal file
@@ -0,0 +1,133 @@
|
||||
package scraper
|
||||
|
||||
import (
	"crypto/tls"
	"errors"
	"fmt"
	"io"
	"net/http"
	"sync"
	"time"

	"github.com/sony/gobreaker/v2"
	"golang.org/x/exp/rand"
)
|
||||
|
||||
// Sentinel errors reported by ScraperClient.Fetch.
var (
	// ErrCircuitOpen is returned when the circuit breaker is open and
	// rejects the call.
	ErrCircuitOpen = errors.New("scraper circuit breaker is open")

	// ErrRateLimited is returned when the target answers with HTTP 429.
	ErrRateLimited = errors.New("scraper hit rate limit (429)")

	// ErrUnavailable is returned when the target answers with HTTP 503.
	ErrUnavailable = errors.New("scraper target unavailable (503)")
)
|
||||
|
||||
type ScraperClient struct {
|
||||
http *http.Client
|
||||
breaker *gobreaker.CircuitBreaker[[]byte]
|
||||
mu sync.Mutex
|
||||
rng *rand.Rand
|
||||
}
|
||||
|
||||
// userAgents is the pool of desktop browser User-Agent strings from which
// getRandomUserAgent picks one value per request.
var userAgents = []string{
	// Chrome 122 on Windows 10
	"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
	// Chrome 121 on macOS
	"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
	// Firefox 123 on Windows 10
	"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0",
	// Chrome 120 on Linux
	"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
	// Firefox 112 on macOS
	"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/112.0",
}
|
||||
|
||||
func NewScraperClient() *ScraperClient {
|
||||
// Custom transport to mask TLS fingerprints somewhat and set timeouts
|
||||
tr := &http.Transport{
|
||||
TLSClientConfig: &tls.Config{MinVersion: tls.VersionTLS12},
|
||||
ForceAttemptHTTP2: true,
|
||||
MaxIdleConns: 100,
|
||||
IdleConnTimeout: 90 * time.Second,
|
||||
}
|
||||
|
||||
client := &http.Client{
|
||||
Transport: tr,
|
||||
Timeout: 15 * time.Second,
|
||||
}
|
||||
|
||||
// Circuit Breaker: Trip on 5 consecutive failures, wait 60 seconds (Exponential behavior is often custom, but standard half-open helps)
|
||||
st := gobreaker.Settings{
|
||||
Name: "NitterScraperCB",
|
||||
MaxRequests: 1,
|
||||
Interval: 0,
|
||||
Timeout: 60 * time.Second, // Wait 60s before allowing retry if Open
|
||||
ReadyToTrip: func(counts gobreaker.Counts) bool {
|
||||
return counts.ConsecutiveFailures >= 3
|
||||
},
|
||||
}
|
||||
|
||||
return &ScraperClient{
|
||||
http: client,
|
||||
breaker: gobreaker.NewCircuitBreaker[[]byte](st),
|
||||
rng: rand.New(rand.NewSource(uint64(time.Now().UnixNano()))),
|
||||
}
|
||||
}
|
||||
|
||||
func (c *ScraperClient) getRandomUserAgent() string {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
return userAgents[c.rng.Intn(len(userAgents))]
|
||||
}
|
||||
|
||||
func (c *ScraperClient) JitterDelay(minMs, maxMs int) {
|
||||
c.mu.Lock()
|
||||
delay := minMs + c.rng.Intn(maxMs-minMs)
|
||||
c.mu.Unlock()
|
||||
time.Sleep(time.Duration(delay) * time.Millisecond)
|
||||
}
|
||||
|
||||
// Fetch returns the raw body byte stream while handling Circuit Breaking and Status checking.
|
||||
func (c *ScraperClient) Fetch(url string) ([]byte, error) {
|
||||
respBody, err := c.breaker.Execute(func() ([]byte, error) {
|
||||
req, err := http.NewRequest("GET", url, nil)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
req.Header.Set("User-Agent", c.getRandomUserAgent())
|
||||
req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8")
|
||||
req.Header.Set("Accept-Language", "en-US,en;q=0.5")
|
||||
|
||||
resp, err := c.http.Do(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode == http.StatusTooManyRequests {
|
||||
return nil, ErrRateLimited
|
||||
}
|
||||
if resp.StatusCode == http.StatusServiceUnavailable {
|
||||
return nil, ErrUnavailable
|
||||
}
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
// Read to memory in Execute block so if it fails, circuit tracks it. ReadAll is fine for HTML scrapes.
|
||||
var data []byte
|
||||
buf := make([]byte, 1024)
|
||||
for {
|
||||
n, err := resp.Body.Read(buf)
|
||||
if n > 0 {
|
||||
data = append(data, buf[:n]...)
|
||||
}
|
||||
if err != nil {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return data, nil
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
if err == gobreaker.ErrOpenState {
|
||||
return nil, ErrCircuitOpen
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return respBody, nil
|
||||
}
|
||||
146
server/internal/scraper/parser.go
Normal file
146
server/internal/scraper/parser.go
Normal file
@@ -0,0 +1,146 @@
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
)
|
||||
|
||||
// ParsedTweet holds the fields ParseTimeline extracts from one timeline item.
type ParsedTweet struct {
	// ID is the last path segment of the tweet link, without a trailing "#m".
	ID string
	// Author is the display name taken from the ".fullname" element.
	Author string
	// Handle is the text of the ".username" element.
	Handle string
	// Content is the text of the ".tweet-content" element.
	Content string

	// Engagement counters read from the ".tweet-stat" elements.
	Likes    int
	Retweets int
	Replies  int

	// CreatedAt is parsed from the date link's title attribute.
	CreatedAt time.Time
}
|
||||
|
||||
// ParseTimeline extracts all tweets from a Nitter timeline HTML page.
|
||||
func ParseTimeline(htmlData []byte) ([]ParsedTweet, error) {
|
||||
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(htmlData))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to load HTML document: %w", err)
|
||||
}
|
||||
|
||||
var tweets []ParsedTweet
|
||||
|
||||
doc.Find(".timeline-item").Each(func(i int, s *goquery.Selection) {
|
||||
// Only parse actual tweets (not "Show thread" links or "Load more")
|
||||
if s.HasClass("show-more") || s.HasClass("more-replies") {
|
||||
return
|
||||
}
|
||||
|
||||
tweet := ParsedTweet{}
|
||||
|
||||
// Author and Handle
|
||||
authorBlock := s.Find(".fullname")
|
||||
if authorBlock.Length() > 0 {
|
||||
tweet.Author = strings.TrimSpace(authorBlock.Text())
|
||||
}
|
||||
|
||||
handleBlock := s.Find(".username")
|
||||
if handleBlock.Length() > 0 {
|
||||
tweet.Handle = strings.TrimSpace(handleBlock.Text())
|
||||
}
|
||||
|
||||
// Content
|
||||
contentBlock := s.Find(".tweet-content")
|
||||
if contentBlock.Length() > 0 {
|
||||
tweet.Content = strings.TrimSpace(contentBlock.Text())
|
||||
}
|
||||
|
||||
// Link (to get ID)
|
||||
linkBlock := s.Find("a.tweet-link")
|
||||
if linkBlock.Length() > 0 {
|
||||
href, _ := linkBlock.Attr("href")
|
||||
parts := strings.Split(href, "/")
|
||||
if len(parts) > 0 {
|
||||
tweet.ID = parts[len(parts)-1]
|
||||
// Nitter sometimes adds #m at the end of links
|
||||
tweet.ID = strings.TrimSuffix(tweet.ID, "#m")
|
||||
}
|
||||
}
|
||||
|
||||
// Date
|
||||
dateBlock := s.Find(".tweet-date a[title]")
|
||||
if dateBlock.Length() > 0 {
|
||||
titleAttr, _ := dateBlock.Attr("title")
|
||||
// Nitter format: "Feb 28, 2026 · 1:23 PM UTC"
|
||||
// A rough parsing could be done here, or we just rely on standard formats.
|
||||
// For simplicity, we just leave it default Time if we can't parse it quickly.
|
||||
if titleAttr != "" {
|
||||
parsedTime, err := time.Parse("Jan 2, 2006 · 3:04 PM MST", titleAttr)
|
||||
if err == nil {
|
||||
tweet.CreatedAt = parsedTime
|
||||
} else {
|
||||
tweet.CreatedAt = time.Now() // Fallback
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Stats
|
||||
statBlock := s.Find(".tweet-stat")
|
||||
statBlock.Each(func(j int, statSel *goquery.Selection) {
|
||||
iconContainer := statSel.Find("span.icon-container > span")
|
||||
class, exists := iconContainer.Attr("class")
|
||||
if !exists {
|
||||
return
|
||||
}
|
||||
|
||||
// Find the text value beside the icon
|
||||
valStr := strings.TrimSpace(statSel.Text())
|
||||
val := parseStatString(valStr)
|
||||
|
||||
if strings.Contains(class, "icon-comment") {
|
||||
tweet.Replies = val
|
||||
} else if strings.Contains(class, "icon-retweet") {
|
||||
tweet.Retweets = val
|
||||
} else if strings.Contains(class, "icon-heart") {
|
||||
tweet.Likes = val
|
||||
}
|
||||
})
|
||||
|
||||
// Only append if it's a valid parsed tweet
|
||||
if tweet.ID != "" && tweet.Content != "" {
|
||||
tweets = append(tweets, tweet)
|
||||
}
|
||||
})
|
||||
|
||||
return tweets, nil
|
||||
}
|
||||
|
||||
// parseStatString converts a counter such as "1,234", "15.4K", "2M" or
// "1.5b" into an integer (15.4K -> 15400).
//
// Commas and spaces are ignored and the K/M/B suffix is case-insensitive.
// The scaled value is rounded to the nearest integer rather than truncated,
// because binary floating point makes products like 32.3*1000 land just
// below the intended whole number. Empty, unparseable, NaN, infinite or
// out-of-range input yields 0.
func parseStatString(s string) int {
	s = strings.ReplaceAll(s, ",", "")
	s = strings.ReplaceAll(s, " ", "")
	if s == "" {
		return 0
	}

	// Inspect the last byte directly so that exactly the byte that matched
	// is the one that gets stripped.
	multiplier := 1.0
	switch s[len(s)-1] {
	case 'k', 'K':
		multiplier = 1e3
	case 'm', 'M':
		multiplier = 1e6
	case 'b', 'B':
		multiplier = 1e9
	}
	if multiplier != 1.0 {
		s = s[:len(s)-1]
	}

	val, err := strconv.ParseFloat(s, 64)
	if err != nil {
		return 0
	}

	const maxInt = int(^uint(0) >> 1)
	scaled := val * multiplier
	// Written as a negated conjunction on purpose: NaN fails both
	// comparisons, so it is rejected together with ±Inf and overflow.
	if !(scaled > -float64(maxInt) && scaled < float64(maxInt)) {
		return 0
	}

	// Round half away from zero.
	if scaled < 0 {
		return int(scaled - 0.5)
	}
	return int(scaled + 0.5)
}
|
||||
Reference in New Issue
Block a user