package scraper import ( "bytes" "fmt" "strconv" "strings" "time" "github.com/PuerkitoBio/goquery" ) type ParsedTweet struct { ID string Author string Handle string Content string Likes int Retweets int Replies int CreatedAt time.Time } // ParseTimeline extracts all tweets from a Nitter timeline HTML page. func ParseTimeline(htmlData []byte) ([]ParsedTweet, error) { doc, err := goquery.NewDocumentFromReader(bytes.NewReader(htmlData)) if err != nil { return nil, fmt.Errorf("failed to load HTML document: %w", err) } var tweets []ParsedTweet doc.Find(".timeline-item").Each(func(i int, s *goquery.Selection) { // Only parse actual tweets (not "Show thread" links or "Load more") if s.HasClass("show-more") || s.HasClass("more-replies") { return } tweet := ParsedTweet{} // Author and Handle authorBlock := s.Find(".fullname") if authorBlock.Length() > 0 { tweet.Author = strings.TrimSpace(authorBlock.Text()) } handleBlock := s.Find(".username") if handleBlock.Length() > 0 { tweet.Handle = strings.TrimSpace(handleBlock.Text()) } // Content contentBlock := s.Find(".tweet-content") if contentBlock.Length() > 0 { tweet.Content = strings.TrimSpace(contentBlock.Text()) } // Link (to get ID) linkBlock := s.Find("a.tweet-link") if linkBlock.Length() > 0 { href, _ := linkBlock.Attr("href") parts := strings.Split(href, "/") if len(parts) > 0 { tweet.ID = parts[len(parts)-1] // Nitter sometimes adds #m at the end of links tweet.ID = strings.TrimSuffix(tweet.ID, "#m") } } // Date dateBlock := s.Find(".tweet-date a[title]") if dateBlock.Length() > 0 { titleAttr, _ := dateBlock.Attr("title") // Nitter format: "Feb 28, 2026 · 1:23 PM UTC" // A rough parsing could be done here, or we just rely on standard formats. // For simplicity, we just leave it default Time if we can't parse it quickly. if titleAttr != "" { parsedTime, err := time.Parse("Jan 2, 2006 · 3:04 PM MST", titleAttr) if err == nil { tweet.CreatedAt = parsedTime } else { tweet.CreatedAt = time.Now() // Fallback } } } // Stats statBlock := s.Find(".tweet-stat") statBlock.Each(func(j int, statSel *goquery.Selection) { iconContainer := statSel.Find("span.icon-container > span") class, exists := iconContainer.Attr("class") if !exists { return } // Find the text value beside the icon valStr := strings.TrimSpace(statSel.Text()) val := parseStatString(valStr) if strings.Contains(class, "icon-comment") { tweet.Replies = val } else if strings.Contains(class, "icon-retweet") { tweet.Retweets = val } else if strings.Contains(class, "icon-heart") { tweet.Likes = val } }) // Only append if it's a valid parsed tweet if tweet.ID != "" && tweet.Content != "" { tweets = append(tweets, tweet) } }) return tweets, nil } // parseStatString converts string representations like "15.4K" to integer 15400 func parseStatString(s string) int { if s == "" { return 0 } s = strings.ReplaceAll(s, ",", "") s = strings.ReplaceAll(s, " ", "") multiplier := 1.0 lower := strings.ToLower(s) if strings.HasSuffix(lower, "k") { multiplier = 1000.0 s = s[:len(s)-1] } else if strings.HasSuffix(lower, "m") { multiplier = 1000000.0 s = s[:len(s)-1] } else if strings.HasSuffix(lower, "b") { multiplier = 1000000000.0 s = s[:len(s)-1] } val, err := strconv.ParseFloat(s, 64) if err != nil { return 0 } return int(val * multiplier) }