import re
import requests

def _first_count(html, strategies):
    """Return ``(label, count)`` from the first strategy that yields a number.

    Args:
        html: Page source to search.
        strategies: Iterable of ``(label, pattern, flags)`` tuples, tried in
            order. Each pattern must capture the number in group 1; commas
            used as thousands separators are stripped before conversion.

    Returns:
        The label and integer count of the first successful strategy, or
        ``None`` when no strategy produces a number.
    """
    for label, pattern, flags in strategies:
        match = re.search(pattern, html, flags)
        if not match:
            continue
        digits = match.group(1).replace(',', '')
        # A `[\d,]+` capture can consist of commas only; skip such a match
        # instead of letting int('') raise ValueError.
        if digits:
            return label, int(digits)
    return None


def test_scrape():
    """Fetch the embed page of a hard-coded Instagram post and print its counts.

    The post short code is taken from the URL, the ``/embed/`` page for that
    code is downloaded, and the like and comment counts are looked up with a
    series of regex strategies ordered from most to least specific. The first
    strategy that produces a number wins, so a genuine count of 0 is no longer
    mistaken for "not found" and overwritten by a later fallback.

    Nothing is returned; progress and results are printed. If the URL does not
    contain a post code, or the server answers with an HTTP error status, a
    message is printed and the function returns early.
    """
    url = "https://www.instagram.com/p/DVOazPSjJQm/?igsh=aGJlaXpyb3E4OXFn"
    code_match = re.search(r'instagram\.com/(?:p|reel|reels)/([A-Za-z0-9_-]+)', url)
    if not code_match:
        print("No post code matched")
        return

    post_code = code_match.group(1)
    embed_url = f"https://www.instagram.com/p/{post_code}/embed/"

    print(f"Fetching {embed_url}")
    embed_res = requests.get(embed_url, headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }, timeout=15)

    # Do not parse an error page as if it were the post.
    if not embed_res.ok:
        print(f"Request failed with HTTP status {embed_res.status_code}")
        return

    # Report the size of the raw payload: len() of the decoded text would be a
    # character count, not a byte count.
    print(f"Got {len(embed_res.content)} bytes")
    embed_html = embed_res.text

    like_strategies = (
        # 1. Exact extraction from the `likeCountClick` anchor tag.
        ("Extracted likes from anchor",
         r'likeCountClick[^>]*>\s*([\d,]+)\s*likes?', re.IGNORECASE),
        # 2. Any "N likes" text node visible in the UI.
        ("Regex likes fallback",
         r'>\s*([\d,]+)\s*likes?<', re.IGNORECASE),
        # 3. Count embedded in the inline JSON payload.
        ("JSON likes",
         r'"edge_media_preview_like"\s*:\s*\{\s*"count"\s*:\s*(\d+)', 0),
    )
    comment_strategies = (
        ("Extracted comments from anchor",
         r'commentCountClick[^>]*>\s*([\d,]+)\s*comments?', re.IGNORECASE),
        ("Regex comments fallback",
         r'>\s*([\d,]+)\s*comments?<', re.IGNORECASE),
        ("JSON comments",
         r'"edge_media_to_comment"\s*:\s*\{\s*"count"\s*:\s*(\d+)', 0),
        # Sometimes the count is only exposed as `"comment_count": N` in inline JS.
        ("JSON alt comments",
         r'"comment_count"\s*:\s*(\d+)', 0),
    )

    for strategies in (like_strategies, comment_strategies):
        found = _first_count(embed_html, strategies)
        if found is not None:
            label, count = found
            print(f"[HTTP Embed] {label}: {count}")

# Run the scrape only when this file is executed as a script, so that merely
# importing the module (for example during test collection) does not trigger
# a network request.
if __name__ == "__main__":
    test_scrape()
