#____________________________________Web Scrapper__________________________________
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import os
import json
import re
import lxml

class Web_Scrapper:
    """
    🌐 Web_Scrapper: A universal and powerful web scraping class.

    Features:
    - Fetch HTML content from URLs
    - Extract tags, text, scripts, styles, tables, forms
    - Extract links, images, emails, and phone numbers
    - Download and save media
    - Handle headers, user agents, proxies, and timeouts
    - Save and load scraped data as JSON
    - Recursive link crawling

    State kept between calls:
    - ``last_url``: URL of the most recent successful ``fetch``
    - ``content``:  raw HTML text of the current page
    - ``soup``:     parsed ``BeautifulSoup`` tree of the current page
    """

    def __init__(self, user_agent: str = None, timeout: int = 10, proxies: dict = None):
        """
        Create a scraper with its own ``requests.Session``.

        Args:
            user_agent: Value for the ``User-Agent`` header; a desktop Chrome
                string is used when omitted.
            timeout: Per-request timeout passed to ``requests``.
            proxies: Optional proxies mapping passed to every request.
        """
        self.session = requests.Session()
        self.timeout = timeout
        self.headers = {
            "User-Agent": user_agent or
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/120.0 Safari/537.36"
        }
        self.proxies = proxies
        self.last_url = None
        self.soup = None
        self.content = None

    # ------------------------------------------------------------
    # 🔹 Fetching & Parsing
    # ------------------------------------------------------------
    def fetch(self, url: str, parse: bool = True) -> bool:
        """
        Fetch HTML content from a URL. Optionally parse it.

        On success ``last_url`` and ``content`` are updated (and ``soup`` when
        ``parse`` is true). On a request error the previous state is left
        untouched, a message is printed and ``False`` is returned.
        """
        try:
            response = self.session.get(
                url,
                headers=self.headers,
                timeout=self.timeout,
                proxies=self.proxies,
            )
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"[❌] Error fetching URL: {e}")
            return False

        self.last_url = url
        self.content = response.text
        if parse:
            self.soup = BeautifulSoup(self.content, "lxml")
        return True

    def get_html(self):
        """Return the raw HTML of the last fetched page."""
        return self.content

    def parse_html(self, html: str):
        """Parse raw HTML text and make it the current page."""
        self.content = html
        self.soup = BeautifulSoup(html, "lxml")

    # ------------------------------------------------------------
    # 🔹 Extractors
    # ------------------------------------------------------------
    def get_title(self):
        """Return the ``<title>`` string, or ``None`` if there is no page or title."""
        if self.soup and self.soup.title:
            return self.soup.title.string
        return None

    def get_text(self, selector=None):
        """
        Return page text.

        With a CSS ``selector`` a list of the stripped text of every matching
        element is returned; without one, the whole page text joined by
        newlines. Returns ``None`` when nothing has been parsed yet.
        """
        if not self.soup:
            return None
        if selector:
            return [el.get_text(strip=True) for el in self.soup.select(selector)]
        return self.soup.get_text(separator="\n", strip=True)

    def get_links(self) -> list:
        """Return all unique hyperlinks on the page, resolved against ``last_url``."""
        if not self.soup:
            return []
        links = {urljoin(self.last_url, a["href"]) for a in self.soup.find_all("a", href=True)}
        return list(links)

    def get_images(self) -> list:
        """Return all unique image URLs on the page, resolved against ``last_url``."""
        if not self.soup:
            return []
        imgs = {urljoin(self.last_url, img["src"]) for img in self.soup.find_all("img", src=True)}
        return list(imgs)

    def get_scripts(self) -> list:
        """Return the resolved ``src`` of every external script."""
        if not self.soup:
            return []
        return [urljoin(self.last_url, s.get("src")) for s in self.soup.find_all("script", src=True)]

    def get_styles(self) -> list:
        """Return the resolved ``href`` of every stylesheet ``<link>``."""
        if not self.soup:
            return []
        # href=True: a <link rel="stylesheet"> without href would make urljoin
        # return the page URL itself, which is not a stylesheet.
        return [
            urljoin(self.last_url, link.get("href"))
            for link in self.soup.find_all("link", rel="stylesheet", href=True)
        ]

    def get_tables(self) -> list:
        """Return all tables as a list (tables) of lists (rows) of cell texts."""
        if not self.soup:
            return []
        return [
            [
                [cell.get_text(strip=True) for cell in tr.find_all(["td", "th"])]
                for tr in table.find_all("tr")
            ]
            for table in self.soup.find_all("table")
        ]

    def get_forms(self) -> list:
        """Return all form action URLs, resolved against ``last_url``."""
        if not self.soup:
            return []
        return [urljoin(self.last_url, f.get("action")) for f in self.soup.find_all("form", action=True)]

    def get_meta_tags(self) -> dict:
        """
        Return meta tag data as a dictionary.

        Keys are the tag's ``name`` (or ``property`` when there is no name);
        tags lacking a key or a ``content`` value are skipped.
        """
        if not self.soup:
            return {}
        metas = {}
        for tag in self.soup.find_all("meta"):
            name = tag.get("name") or tag.get("property")
            content = tag.get("content")
            if name and content:
                metas[name] = content
        return metas

    def get_emails(self) -> list:
        """
        Extract unique e-mail addresses from the raw page content.

        Trailing ``.``/``-`` characters picked up from surrounding prose
        (e.g. the full stop in ``"write to bob@example.com."``) are removed.
        """
        if not self.content:
            return []
        pattern = r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"
        emails = set()
        for match in re.findall(pattern, self.content):
            email = match.rstrip(".-")
            # After trimming, the domain must still contain a dot to be usable.
            if "." in email.rpartition("@")[2]:
                emails.add(email)
        return list(emails)

    def get_phone_numbers(self) -> list:
        """Extract unique phone numbers (basic patterns) from the raw page content."""
        if not self.content:
            return []
        pattern = r"\+?\d[\d\-\s]{7,}\d"
        return list(set(re.findall(pattern, self.content)))

    def find(self, tag, **kwargs):
        """Proxy for ``soup.find``; returns ``None`` when nothing is parsed."""
        if not self.soup:
            return None
        return self.soup.find(tag, **kwargs)

    def find_all(self, tag, **kwargs):
        """Proxy for ``soup.find_all``; returns ``[]`` when nothing is parsed."""
        if not self.soup:
            return []
        return self.soup.find_all(tag, **kwargs)

    # ------------------------------------------------------------
    # 🔹 Download Utilities
    # ------------------------------------------------------------
    def download_file(self, url: str, save_path: str = None):
        """
        Download ``url`` to disk.

        Args:
            url: Resource to download.
            save_path: Destination path. Defaults to the last path segment of
                the URL, or ``"download"`` when the URL has no file name.

        Returns:
            The path written, or ``None`` if the download failed (the error is
            printed, never raised).
        """
        try:
            # The context manager releases the streamed connection even when
            # writing fails part-way; proxies are honoured just like in fetch().
            with self.session.get(
                url,
                headers=self.headers,
                timeout=self.timeout,
                proxies=self.proxies,
                stream=True,
            ) as r:
                r.raise_for_status()
                filename = save_path or os.path.basename(urlparse(url).path) or "download"
                with open(filename, "wb") as f:
                    for chunk in r.iter_content(8192):
                        if chunk:
                            f.write(chunk)
            print(f"[✅] Downloaded: {filename}")
            return filename
        except Exception as e:
            # Deliberately broad: downloading is best-effort and reports failure
            # through the return value rather than by raising.
            print(f"[❌] Failed to download {url}: {e}")
            return None

    def download_all_images(self, folder="images"):
        """
        Download every image of the current page into ``folder``.

        The folder is created if needed. Images whose URL has no file name get
        a generated ``image_<n>`` name instead of colliding with the folder
        path.

        Returns:
            List of the paths that were downloaded successfully.
        """
        os.makedirs(folder, exist_ok=True)
        saved = []
        for index, img_url in enumerate(self.get_images()):
            name = os.path.basename(urlparse(img_url).path) or f"image_{index}"
            path = self.download_file(img_url, os.path.join(folder, name))
            if path is not None:
                saved.append(path)
        return saved

    # ------------------------------------------------------------
    # 🔹 Data Handling
    # ------------------------------------------------------------
    def save_json(self, data, filename="scraped_data.json"):
        """Write ``data`` to ``filename`` as indented UTF-8 JSON."""
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=4)
        print(f"[💾] Data saved to {filename}")

    def load_json(self, filename="scraped_data.json"):
        """Load JSON from ``filename``; print a warning and return ``None`` if it is missing."""
        try:
            with open(filename, "r", encoding="utf-8") as f:
                return json.load(f)
        except FileNotFoundError:
            print("[⚠️] File not found!")
            return None

    # ------------------------------------------------------------
    # 🔹 Utility
    # ------------------------------------------------------------
    def search(self, pattern: str) -> list:
        """Return all case-insensitive regex matches of ``pattern`` in the raw content."""
        if not self.content:
            return []
        return re.findall(pattern, self.content, re.IGNORECASE)

    def clear_cache(self):
        """Forget the current page (content, parsed tree and URL)."""
        self.content = None
        self.soup = None
        self.last_url = None

    def crawl_links(self, depth: int = 1, filter_fn=None) -> list:
        """
        Crawl links breadth-first, starting from ``last_url``, up to ``depth``.

        Pages at link distance 0..``depth`` from the start page are fetched;
        every link found on a fetched page is collected. Each URL is fetched
        at most once, even if the fetch fails.

        Note: crawling calls ``fetch``, so ``last_url``/``content``/``soup``
        end up describing the last page that was fetched successfully.

        Args:
            depth: Maximum link distance from the start page to fetch.
            filter_fn: Optional ``filter_fn(url) -> bool``; links for which it
                returns false are neither collected nor followed.

        Returns:
            List of unique links discovered.
        """
        from collections import deque

        if not self.last_url or depth < 0:
            return []

        start = self.last_url
        seen = {start}              # every URL ever queued, fetched or not
        queue = deque([(start, 0)])  # deque gives O(1) pops from the left
        all_links = set()

        while queue:
            url, d = queue.popleft()
            if not self.fetch(url):
                continue
            links = self.get_links()
            if filter_fn:
                links = [link for link in links if filter_fn(link)]
            all_links.update(links)
            if d + 1 > depth:
                continue  # children would be beyond the depth limit
            for link in links:
                if link not in seen:
                    seen.add(link)
                    queue.append((link, d + 1))
        return list(all_links)

    def __repr__(self):
        return f"<Web_Scrapper(url='{self.last_url}')>"
