147 lines
3.5 KiB
Go
147 lines
3.5 KiB
Go
package scraper
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
)
|
|
|
|
// ParsedTweet holds the fields ParseTimeline extracts from a single
// timeline item of a Nitter HTML page.
type ParsedTweet struct {
	// ID is the last path segment of the tweet's permalink.
	// Author is the display name, Handle the account name, and Content
	// the whitespace-trimmed tweet text.
	ID, Author, Handle, Content string

	// Engagement counters read from the item's stat row; a counter that
	// is absent or unparseable stays 0.
	Likes, Retweets, Replies int

	// CreatedAt is the timestamp parsed from the date link's title
	// attribute.
	CreatedAt time.Time
}
|
|
|
|
// ParseTimeline extracts all tweets from a Nitter timeline HTML page.
|
|
func ParseTimeline(htmlData []byte) ([]ParsedTweet, error) {
|
|
doc, err := goquery.NewDocumentFromReader(bytes.NewReader(htmlData))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to load HTML document: %w", err)
|
|
}
|
|
|
|
var tweets []ParsedTweet
|
|
|
|
doc.Find(".timeline-item").Each(func(i int, s *goquery.Selection) {
|
|
// Only parse actual tweets (not "Show thread" links or "Load more")
|
|
if s.HasClass("show-more") || s.HasClass("more-replies") {
|
|
return
|
|
}
|
|
|
|
tweet := ParsedTweet{}
|
|
|
|
// Author and Handle
|
|
authorBlock := s.Find(".fullname")
|
|
if authorBlock.Length() > 0 {
|
|
tweet.Author = strings.TrimSpace(authorBlock.Text())
|
|
}
|
|
|
|
handleBlock := s.Find(".username")
|
|
if handleBlock.Length() > 0 {
|
|
tweet.Handle = strings.TrimSpace(handleBlock.Text())
|
|
}
|
|
|
|
// Content
|
|
contentBlock := s.Find(".tweet-content")
|
|
if contentBlock.Length() > 0 {
|
|
tweet.Content = strings.TrimSpace(contentBlock.Text())
|
|
}
|
|
|
|
// Link (to get ID)
|
|
linkBlock := s.Find("a.tweet-link")
|
|
if linkBlock.Length() > 0 {
|
|
href, _ := linkBlock.Attr("href")
|
|
parts := strings.Split(href, "/")
|
|
if len(parts) > 0 {
|
|
tweet.ID = parts[len(parts)-1]
|
|
// Nitter sometimes adds #m at the end of links
|
|
tweet.ID = strings.TrimSuffix(tweet.ID, "#m")
|
|
}
|
|
}
|
|
|
|
// Date
|
|
dateBlock := s.Find(".tweet-date a[title]")
|
|
if dateBlock.Length() > 0 {
|
|
titleAttr, _ := dateBlock.Attr("title")
|
|
// Nitter format: "Feb 28, 2026 · 1:23 PM UTC"
|
|
// A rough parsing could be done here, or we just rely on standard formats.
|
|
// For simplicity, we just leave it default Time if we can't parse it quickly.
|
|
if titleAttr != "" {
|
|
parsedTime, err := time.Parse("Jan 2, 2006 · 3:04 PM MST", titleAttr)
|
|
if err == nil {
|
|
tweet.CreatedAt = parsedTime
|
|
} else {
|
|
tweet.CreatedAt = time.Now() // Fallback
|
|
}
|
|
}
|
|
}
|
|
|
|
// Stats
|
|
statBlock := s.Find(".tweet-stat")
|
|
statBlock.Each(func(j int, statSel *goquery.Selection) {
|
|
iconContainer := statSel.Find("span.icon-container > span")
|
|
class, exists := iconContainer.Attr("class")
|
|
if !exists {
|
|
return
|
|
}
|
|
|
|
// Find the text value beside the icon
|
|
valStr := strings.TrimSpace(statSel.Text())
|
|
val := parseStatString(valStr)
|
|
|
|
if strings.Contains(class, "icon-comment") {
|
|
tweet.Replies = val
|
|
} else if strings.Contains(class, "icon-retweet") {
|
|
tweet.Retweets = val
|
|
} else if strings.Contains(class, "icon-heart") {
|
|
tweet.Likes = val
|
|
}
|
|
})
|
|
|
|
// Only append if it's a valid parsed tweet
|
|
if tweet.ID != "" && tweet.Content != "" {
|
|
tweets = append(tweets, tweet)
|
|
}
|
|
})
|
|
|
|
return tweets, nil
|
|
}
|
|
|
|
// parseStatString converts a counter as displayed on the page into an int.
//
// Commas and spaces are removed first. A trailing "k", "m" or "b" (either
// case) multiplies the number by 1e3, 1e6 or 1e9, so "15.4K" becomes 15400.
// The scaled value is rounded to the nearest integer. Empty or unparseable
// input, NaN, and values outside the range of int all yield 0.
func parseStatString(s string) int {
	const (
		maxInt = int(^uint(0) >> 1)
		minInt = -maxInt - 1
	)

	if s == "" {
		return 0
	}
	s = strings.ReplaceAll(s, ",", "")
	s = strings.ReplaceAll(s, " ", "")

	multiplier := 1.0
	lower := strings.ToLower(s)

	switch {
	case strings.HasSuffix(lower, "k"):
		multiplier = 1e3
		s = s[:len(s)-1]
	case strings.HasSuffix(lower, "m"):
		multiplier = 1e6
		s = s[:len(s)-1]
	case strings.HasSuffix(lower, "b"):
		multiplier = 1e9
		s = s[:len(s)-1]
	}

	val, err := strconv.ParseFloat(s, 64)
	if err != nil {
		return 0
	}

	scaled := val * multiplier

	// ParseFloat also accepts "NaN", "Inf" and huge exponents; converting
	// those to int is implementation-defined, so treat them as unparseable.
	// (scaled != scaled is true only for NaN.)
	if scaled != scaled || scaled >= float64(maxInt) || scaled < float64(minInt) {
		return 0
	}

	// Round half away from zero instead of truncating: 32.3*1000 evaluates
	// to 32299.999999999996 in float64 and would otherwise become 32299.
	if scaled < 0 {
		return int(scaled - 0.5)
	}
	return int(scaled + 0.5)
}
|