feat: 部署初版测试
Some checks failed
Extension Build & Release / build (push) Failing after 1m5s
Backend Deploy (Go + Docker) / deploy (push) Failing after 1m40s
Web Console Deploy (Vue 3 + Vite) / deploy (push) Has been cancelled

This commit is contained in:
zs
2026-03-02 21:25:21 +08:00
parent db3abb3174
commit 8cf6cb944b
97 changed files with 10250 additions and 209 deletions

View File

@@ -0,0 +1,133 @@
package scraper
import (
	"crypto/tls"
	"errors"
	"fmt"
	"io"
	"net/http"
	"sync"
	"time"

	"github.com/sony/gobreaker/v2"
	"golang.org/x/exp/rand"
)
// Sentinel errors returned by Fetch so callers can distinguish breaker
// rejections from upstream throttling/outage using errors.Is.
var (
	ErrCircuitOpen = errors.New("scraper circuit breaker is open")
	ErrRateLimited = errors.New("scraper hit rate limit (429)")
	ErrUnavailable = errors.New("scraper target unavailable (503)")
)
// ScraperClient is an HTTP client for scraping Nitter pages. It wraps all
// requests in a circuit breaker and rotates User-Agent strings per request.
type ScraperClient struct {
	http    *http.Client                       // underlying HTTP client with timeouts
	breaker *gobreaker.CircuitBreaker[[]byte]  // trips after repeated failures; see NewScraperClient
	mu      sync.Mutex                         // guards rng (rand.Rand is not safe for concurrent use)
	rng     *rand.Rand                         // seeded once at construction; used for UA choice and jitter
}
// userAgents is the pool of desktop-browser User-Agent strings that
// getRandomUserAgent picks from, one per request, to reduce fingerprinting.
var userAgents = []string{
	"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
	"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
	"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0",
	"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
	"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/112.0",
}
// NewScraperClient builds a ScraperClient with sane transport timeouts and
// a circuit breaker. The breaker opens after 3 consecutive failures and
// stays open for 60 seconds; it then allows a single half-open probe
// (MaxRequests: 1) before deciding whether to close again.
func NewScraperClient() *ScraperClient {
	// Transport tuned for scraping: TLS 1.2+, HTTP/2 where possible,
	// and idle-connection reuse.
	transport := &http.Transport{
		TLSClientConfig:   &tls.Config{MinVersion: tls.VersionTLS12},
		ForceAttemptHTTP2: true,
		MaxIdleConns:      100,
		IdleConnTimeout:   90 * time.Second,
	}

	settings := gobreaker.Settings{
		Name:        "NitterScraperCB",
		MaxRequests: 1,
		Interval:    0,
		Timeout:     60 * time.Second, // how long the breaker stays Open before half-open
		ReadyToTrip: func(counts gobreaker.Counts) bool {
			return counts.ConsecutiveFailures >= 3
		},
	}

	return &ScraperClient{
		http: &http.Client{
			Transport: transport,
			Timeout:   15 * time.Second,
		},
		breaker: gobreaker.NewCircuitBreaker[[]byte](settings),
		rng:     rand.New(rand.NewSource(uint64(time.Now().UnixNano()))),
	}
}
// getRandomUserAgent returns a random entry from userAgents. The mutex
// serializes access to c.rng, which is not safe for concurrent use.
func (c *ScraperClient) getRandomUserAgent() string {
	c.mu.Lock()
	defer c.mu.Unlock()
	return userAgents[c.rng.Intn(len(userAgents))]
}
// JitterDelay sleeps for a random duration in [minMs, maxMs) milliseconds,
// used to pace scrape requests so they look less bot-like.
//
// Fix: the original called rng.Intn(maxMs-minMs) unconditionally, which
// panics when maxMs <= minMs (Intn requires a positive argument). That
// degenerate case now simply sleeps minMs with no jitter.
func (c *ScraperClient) JitterDelay(minMs, maxMs int) {
	delay := minMs
	if maxMs > minMs {
		c.mu.Lock() // rng is not concurrency-safe
		delay += c.rng.Intn(maxMs - minMs)
		c.mu.Unlock()
	}
	time.Sleep(time.Duration(delay) * time.Millisecond)
}
// Fetch retrieves url and returns the raw response body, routing the
// request through the circuit breaker so repeated failures trip it open.
//
// Returns ErrCircuitOpen when the breaker rejects the call, ErrRateLimited
// on HTTP 429, ErrUnavailable on HTTP 503, and a generic error for any
// other non-2xx status or transport/read failure.
func (c *ScraperClient) Fetch(url string) ([]byte, error) {
	respBody, err := c.breaker.Execute(func() ([]byte, error) {
		req, err := http.NewRequest("GET", url, nil)
		if err != nil {
			return nil, err
		}
		// Rotate the User-Agent per request to reduce fingerprinting.
		req.Header.Set("User-Agent", c.getRandomUserAgent())
		req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8")
		req.Header.Set("Accept-Language", "en-US,en;q=0.5")

		resp, err := c.http.Do(req)
		if err != nil {
			return nil, err
		}
		defer resp.Body.Close()

		switch {
		case resp.StatusCode == http.StatusTooManyRequests:
			return nil, ErrRateLimited
		case resp.StatusCode == http.StatusServiceUnavailable:
			return nil, ErrUnavailable
		case resp.StatusCode < 200 || resp.StatusCode >= 300:
			return nil, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
		}

		// Read inside Execute so mid-stream failures count against the
		// breaker. The original hand-rolled loop broke on ANY error —
		// including non-EOF read errors — and returned a silently
		// truncated body with a nil error; io.ReadAll propagates them.
		data, err := io.ReadAll(resp.Body)
		if err != nil {
			return nil, fmt.Errorf("reading response body: %w", err)
		}
		return data, nil
	})
	if err != nil {
		// Normalize breaker rejections (Open state, or half-open over its
		// MaxRequests budget) to the package sentinel so callers only need
		// to check ErrCircuitOpen.
		if errors.Is(err, gobreaker.ErrOpenState) || errors.Is(err, gobreaker.ErrTooManyRequests) {
			return nil, ErrCircuitOpen
		}
		return nil, err
	}
	return respBody, nil
}

View File

@@ -0,0 +1,146 @@
package scraper
import (
"bytes"
"fmt"
"strconv"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
)
// ParsedTweet is a single tweet extracted from a Nitter timeline page.
type ParsedTweet struct {
	ID        string    // status ID taken from the last path segment of the permalink
	Author    string    // display name (from .fullname)
	Handle    string    // username/handle (from .username)
	Content   string    // plain-text tweet body (from .tweet-content)
	Likes     int       // heart count; 0 if the stat is absent or unparseable
	Retweets  int       // retweet count; 0 if absent
	Replies   int       // reply count; 0 if absent
	CreatedAt time.Time // parsed from the date link title; zero Time when the title is missing
}
// ParseTimeline extracts all tweets from a Nitter timeline HTML page.
// Items that are not real tweets ("Show thread" / "Load more" rows) or
// that yield no ID or text content are skipped. A nil error with an empty
// slice means the page parsed but contained no usable tweets.
func ParseTimeline(htmlData []byte) ([]ParsedTweet, error) {
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(htmlData))
	if err != nil {
		return nil, fmt.Errorf("failed to load HTML document: %w", err)
	}
	var tweets []ParsedTweet
	doc.Find(".timeline-item").Each(func(i int, s *goquery.Selection) {
		// Only parse actual tweets (not "Show thread" links or "Load more").
		if s.HasClass("show-more") || s.HasClass("more-replies") {
			return
		}
		tweet := ParsedTweet{
			Author:    strings.TrimSpace(s.Find(".fullname").Text()),
			Handle:    strings.TrimSpace(s.Find(".username").Text()),
			Content:   strings.TrimSpace(s.Find(".tweet-content").Text()),
			ID:        extractTweetID(s),
			CreatedAt: extractTweetDate(s),
		}
		applyTweetStats(s, &tweet)
		// Only append if it's a valid parsed tweet.
		if tweet.ID != "" && tweet.Content != "" {
			tweets = append(tweets, tweet)
		}
	})
	return tweets, nil
}

// extractTweetID pulls the status ID from the last path segment of the
// tweet permalink; returns "" when the link is missing.
func extractTweetID(s *goquery.Selection) string {
	href, _ := s.Find("a.tweet-link").Attr("href")
	if href == "" {
		return ""
	}
	parts := strings.Split(href, "/")
	id := parts[len(parts)-1]
	// Nitter sometimes adds #m at the end of links.
	return strings.TrimSuffix(id, "#m")
}

// extractTweetDate parses the timestamp from the date link's title
// attribute. Nitter format: "Feb 28, 2026 · 1:23 PM UTC". Returns the zero
// Time when the title is missing/empty, and time.Now() as a fallback when
// the title is present but unparseable (matching the original behavior).
func extractTweetDate(s *goquery.Selection) time.Time {
	titleAttr, _ := s.Find(".tweet-date a[title]").Attr("title")
	if titleAttr == "" {
		return time.Time{}
	}
	parsedTime, err := time.Parse("Jan 2, 2006 · 3:04 PM MST", titleAttr)
	if err != nil {
		return time.Now() // Fallback
	}
	return parsedTime
}

// applyTweetStats fills Replies/Retweets/Likes on tweet from the
// .tweet-stat icon blocks; stats whose icon class is missing are skipped.
func applyTweetStats(s *goquery.Selection, tweet *ParsedTweet) {
	s.Find(".tweet-stat").Each(func(j int, statSel *goquery.Selection) {
		class, exists := statSel.Find("span.icon-container > span").Attr("class")
		if !exists {
			return
		}
		// The stat value is the text beside the icon.
		val := parseStatString(strings.TrimSpace(statSel.Text()))
		switch {
		case strings.Contains(class, "icon-comment"):
			tweet.Replies = val
		case strings.Contains(class, "icon-retweet"):
			tweet.Retweets = val
		case strings.Contains(class, "icon-heart"):
			tweet.Likes = val
		}
	})
}
// parseStatString converts Nitter stat strings like "15.4K" or "1,234"
// into integers ("15.4K" -> 15400). Commas and spaces are stripped; the
// suffixes k/m/b (case-insensitive) scale by 1e3/1e6/1e9. Unparseable
// input yields 0.
//
// Fix: the original truncated with int(val * multiplier), so binary
// float representation error (e.g. 2.3 stored as 2.2999…) could produce
// an off-by-one result; counts are non-negative, so adding 0.5 before
// truncation rounds half-up correctly.
func parseStatString(s string) int {
	if s == "" {
		return 0
	}
	s = strings.ReplaceAll(s, ",", "")
	s = strings.ReplaceAll(s, " ", "")

	multiplier := 1.0
	switch lower := strings.ToLower(s); {
	case strings.HasSuffix(lower, "k"):
		multiplier = 1e3
		s = s[:len(s)-1]
	case strings.HasSuffix(lower, "m"):
		multiplier = 1e6
		s = s[:len(s)-1]
	case strings.HasSuffix(lower, "b"):
		multiplier = 1e9
		s = s[:len(s)-1]
	}

	val, err := strconv.ParseFloat(s, 64)
	if err != nil {
		return 0
	}
	// Round half-up; safe because stat counts are never negative.
	return int(val*multiplier + 0.5)
}