Files
InsightReply/server/internal/scraper/parser.go
zs 8cf6cb944b
Some checks failed
Extension Build & Release / build (push) Failing after 1m5s
Backend Deploy (Go + Docker) / deploy (push) Failing after 1m40s
Web Console Deploy (Vue 3 + Vite) / deploy (push) Has been cancelled
feat: 部署初版测试
2026-03-02 21:25:21 +08:00

147 lines
3.5 KiB
Go

package scraper
import (
"bytes"
"fmt"
"strconv"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
)
// ParsedTweet is the flat record produced for each tweet found by
// ParseTimeline.
type ParsedTweet struct {
	// Textual fields: the tweet ID (taken from the tweet link), the author's
	// display name, the author's handle and the tweet body text.
	ID, Author, Handle, Content string

	// Engagement counters read from the tweet's stat icons.
	Likes, Retweets, Replies int

	// CreatedAt is the tweet timestamp; it remains the zero time when the
	// page carries no date title for the tweet.
	CreatedAt time.Time
}
// ParseTimeline extracts all tweets from a Nitter timeline HTML page.
//
// Every ".timeline-item" element is inspected, except items carrying the
// "show-more" or "more-replies" class. An item is returned only when both a
// tweet ID and non-empty content could be extracted from it. The returned
// slice is nil when no item qualifies.
//
// An error is returned only when the HTML cannot be loaded into a document.
func ParseTimeline(htmlData []byte) ([]ParsedTweet, error) {
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(htmlData))
	if err != nil {
		return nil, fmt.Errorf("failed to load HTML document: %w", err)
	}

	var tweets []ParsedTweet
	doc.Find(".timeline-item").Each(func(_ int, s *goquery.Selection) {
		// Only parse actual tweets (not "Show thread" links or "Load more").
		if s.HasClass("show-more") || s.HasClass("more-replies") {
			return
		}
		if tweet, ok := parseTimelineItem(s); ok {
			tweets = append(tweets, tweet)
		}
	})
	return tweets, nil
}

// parseTimelineItem builds a ParsedTweet from one timeline item. The boolean
// result is false when the item yields no ID or no content.
func parseTimelineItem(s *goquery.Selection) (ParsedTweet, bool) {
	// Nitter date titles look like "Feb 28, 2026 · 1:23 PM UTC".
	const dateLayout = "Jan 2, 2006 · 3:04 PM MST"

	var tweet ParsedTweet

	// Selection.Text concatenates the text of every matched element, so each
	// lookup is narrowed to its first match. Without this, an item that
	// contains more than one name/handle element (for example an embedded
	// quoted tweet) would produce the names glued together.
	tweet.Author = strings.TrimSpace(s.Find(".fullname").First().Text())
	tweet.Handle = strings.TrimSpace(s.Find(".username").First().Text())
	tweet.Content = strings.TrimSpace(s.Find(".tweet-content").First().Text())

	// The tweet ID is the last path segment of the tweet link.
	if href, ok := s.Find("a.tweet-link").Attr("href"); ok {
		tweet.ID = tweetIDFromHref(href)
	}

	// A missing or empty date title leaves CreatedAt at its zero value; a
	// title that is present but does not match the layout falls back to the
	// current time.
	if title, _ := s.Find(".tweet-date a[title]").Attr("title"); title != "" {
		created, err := time.Parse(dateLayout, title)
		if err != nil {
			created = time.Now() // Fallback
		}
		tweet.CreatedAt = created
	}

	s.Find(".tweet-stat").Each(func(_ int, stat *goquery.Selection) {
		// The icon container is matched by class only, not by element type.
		// NOTE(review): upstream Nitter appears to render it as a <div>, which
		// a "span.icon-container" selector would never match; confirm against
		// the target instance.
		class, ok := stat.Find(".icon-container > span").Attr("class")
		if !ok {
			return
		}
		// The numeric value is the text rendered beside the icon.
		count := parseStatString(strings.TrimSpace(stat.Text()))
		switch {
		case strings.Contains(class, "icon-comment"):
			tweet.Replies = count
		case strings.Contains(class, "icon-retweet"):
			tweet.Retweets = count
		case strings.Contains(class, "icon-heart"):
			tweet.Likes = count
		}
	})

	return tweet, tweet.ID != "" && tweet.Content != ""
}

// tweetIDFromHref returns the last path segment of href, ignoring any
// fragment (Nitter sometimes appends "#m"), query string or trailing slash.
func tweetIDFromHref(href string) string {
	if i := strings.IndexAny(href, "#?"); i >= 0 {
		href = href[:i]
	}
	href = strings.TrimRight(href, "/")
	// LastIndex yields -1 when there is no slash, which keeps the whole string.
	return href[strings.LastIndex(href, "/")+1:]
}
// parseStatString converts a display counter such as "1,234" or "15.4K" into
// an integer (1234 and 15400 respectively).
//
// Commas and spaces are ignored. A trailing K, M or B (either case) scales
// the number by a thousand, a million or a billion. The scaled value is
// rounded to the nearest integer so that binary floating-point error cannot
// produce an off-by-one result (for example "4.1M" is 4100000, not 4099999).
//
// It returns 0 for empty or unparseable input, and for values that are not
// finite or do not fit in an int.
func parseStatString(s string) int {
	s = strings.ReplaceAll(s, ",", "")
	s = strings.ReplaceAll(s, " ", "")
	if s == "" {
		return 0
	}

	// Inspect the last byte directly so the suffix that is tested is exactly
	// the byte that gets removed.
	multiplier := 1.0
	switch s[len(s)-1] {
	case 'k', 'K':
		multiplier = 1e3
		s = s[:len(s)-1]
	case 'm', 'M':
		multiplier = 1e6
		s = s[:len(s)-1]
	case 'b', 'B':
		multiplier = 1e9
		s = s[:len(s)-1]
	}

	val, err := strconv.ParseFloat(s, 64)
	if err != nil {
		return 0
	}

	const maxInt = int(^uint(0) >> 1)
	scaled := val * multiplier
	// scaled != scaled is true only for NaN. Converting NaN or an
	// out-of-range float to int is implementation-defined, so reject both.
	if scaled != scaled || scaled >= float64(maxInt) || scaled <= -float64(maxInt) {
		return 0
	}

	// Round half away from zero instead of truncating.
	if scaled < 0 {
		return int(scaled - 0.5)
	}
	return int(scaled + 0.5)
}