// InsightReply/server/internal/scraper/parser.go

package scraper

import (
	"bytes"
	"fmt"
	"math"
	"strconv"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
)

// ParsedTweet holds the fields ParseTimeline extracts from a single
// ".timeline-item" element of a Nitter timeline page.
type ParsedTweet struct {
	// ID is the last path segment of the "a.tweet-link" href, with a
	// trailing "#m" removed.
	ID        string
	// Author is the trimmed text of the ".fullname" element.
	Author    string
	// Handle is the trimmed text of the ".username" element, stored as
	// rendered (no further normalization is applied).
	Handle    string
	// Content is the trimmed text of the ".tweet-content" element.
	Content   string
	// Likes is the value of the ".tweet-stat" entry whose icon class
	// contains "icon-heart"; 0 when absent or unparseable.
	Likes     int
	// Retweets is the value of the ".tweet-stat" entry whose icon class
	// contains "icon-retweet"; 0 when absent or unparseable.
	Retweets  int
	// Replies is the value of the ".tweet-stat" entry whose icon class
	// contains "icon-comment"; 0 when absent or unparseable.
	Replies   int
	// CreatedAt is parsed from the title attribute of the ".tweet-date" link.
	// ParseTimeline stores time.Now() when that title cannot be parsed and
	// leaves the zero time.Time when no non-empty title is found.
	CreatedAt time.Time
}

// ParseTimeline extracts all tweets from a Nitter timeline HTML page.
//
// Every ".timeline-item" element is inspected, except the "show-more" and
// "more-replies" placeholders. An item is returned only when both a tweet ID
// and non-empty content could be extracted; everything else is skipped
// silently. Tweets are returned in document order.
//
// The returned error is non-nil only when the HTML document itself cannot be
// loaded; in that case the slice is nil.
func ParseTimeline(htmlData []byte) ([]ParsedTweet, error) {
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(htmlData))
	if err != nil {
		return nil, fmt.Errorf("failed to load HTML document: %w", err)
	}

	var tweets []ParsedTweet

	doc.Find(".timeline-item").Each(func(_ int, s *goquery.Selection) {
		// Only parse actual tweets (not "Show thread" links or "Load more").
		if s.HasClass("show-more") || s.HasClass("more-replies") {
			return
		}

		// Text() concatenates the text of every matched element, and an item
		// that embeds a quoted tweet carries a second name/handle. First()
		// restricts each lookup to the first match in document order so the
		// fields are not merged with the nested ones.
		tweet := ParsedTweet{
			Author:  strings.TrimSpace(s.Find(".fullname").First().Text()),
			Handle:  strings.TrimSpace(s.Find(".username").First().Text()),
			Content: strings.TrimSpace(s.Find(".tweet-content").First().Text()),
		}

		// The tweet ID is the last path segment of the permalink. Nitter
		// appends a "#m" fragment; drop any fragment or query string and any
		// trailing slash before taking the final segment.
		if href, ok := s.Find("a.tweet-link").First().Attr("href"); ok {
			if cut := strings.IndexAny(href, "?#"); cut >= 0 {
				href = href[:cut]
			}
			href = strings.TrimRight(href, "/")
			tweet.ID = href[strings.LastIndex(href, "/")+1:]
		}

		// Nothing below can make an item without ID or content valid, so
		// skip the remaining lookups for it.
		if tweet.ID == "" || tweet.Content == "" {
			return
		}

		// Date. Nitter title format: "Feb 28, 2026 · 1:23 PM UTC".
		// A title that does not match this layout falls back to the current
		// time; a missing or empty title leaves CreatedAt as the zero value.
		if title, _ := s.Find(".tweet-date a[title]").First().Attr("title"); title != "" {
			if parsed, perr := time.Parse("Jan 2, 2006 · 3:04 PM MST", title); perr == nil {
				tweet.CreatedAt = parsed
			} else {
				tweet.CreatedAt = time.Now() // Fallback
			}
		}

		// Stats: each ".tweet-stat" holds an icon span whose class names the
		// metric, followed by the counter text.
		s.Find(".tweet-stat").Each(func(_ int, stat *goquery.Selection) {
			class, ok := stat.Find("span.icon-container > span").Attr("class")
			if !ok {
				return
			}

			count := parseStatString(strings.TrimSpace(stat.Text()))

			switch {
			case strings.Contains(class, "icon-comment"):
				tweet.Replies = count
			case strings.Contains(class, "icon-retweet"):
				tweet.Retweets = count
			case strings.Contains(class, "icon-heart"):
				tweet.Likes = count
			}
		})

		tweets = append(tweets, tweet)
	})

	return tweets, nil
}

// parseStatString converts an abbreviated counter such as "15.4K" or "1,234"
// into an integer (15400 and 1234 respectively).
//
// Commas and spaces are removed first. A trailing K, M or B (either case)
// scales the number by 1e3, 1e6 or 1e9. The scaled value is rounded to the
// nearest integer rather than truncated, so binary floating-point error
// cannot drop a unit ("1.005K" is 1005, not 1004).
//
// Empty, non-numeric, non-finite or out-of-range input yields 0.
func parseStatString(s string) int {
	s = strings.ReplaceAll(s, ",", "")
	s = strings.ReplaceAll(s, " ", "")
	if s == "" {
		return 0
	}

	multiplier := 1.0
	switch s[len(s)-1] {
	case 'k', 'K':
		multiplier = 1e3
	case 'm', 'M':
		multiplier = 1e6
	case 'b', 'B':
		multiplier = 1e9
	}
	if multiplier != 1 {
		// The suffix is a single ASCII byte, so slicing it off is safe.
		s = s[:len(s)-1]
	}

	val, err := strconv.ParseFloat(s, 64)
	if err != nil {
		return 0
	}

	scaled := math.Round(val * multiplier)

	// ParseFloat accepts "Inf" and "NaN", and large inputs can exceed the int
	// range; converting such floats to int is implementation-defined, so
	// treat them as unparseable instead.
	const maxInt = int(^uint(0) >> 1)
	if math.IsNaN(scaled) || scaled >= float64(maxInt) || scaled <= -float64(maxInt) {
		return 0
	}

	return int(scaled)
}