'''
Created on 2022-10-25

@author: wf
'''
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

class HtmlTables(object):
    """
    HtmlTables extractor

    Fetches the HTML page at a given url, parses it with BeautifulSoup
    and extracts the contents of its <table> elements as records.
    """

    def __init__(self, url):
        """
        Constructor

        Args:
            url(str): the url of the HTML page to read the tables from
        """
        # send a browser-like User-Agent instead of urllib's default one
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        # the context manager closes the HTTP response once it has been read
        with urlopen(req) as response:
            self.html_page = response.read()

        self.soup = BeautifulSoup(self.html_page, 'html.parser')

    @staticmethod
    def _table_records(table) -> list:
        """
        convert a single table element to a list of record dicts

        Args:
            table: the table element to extract the rows from

        Return:
            list: one dict per row that has at least one td cell
        """
        rows = table.find_all('tr', recursive=True)
        fields = []
        category = None
        # first pass: collect the column names from all th cells;
        # a th with a colspan attribute is taken as the category of the
        # table (the last one found wins), not as a column name
        for tr in rows:
            for th in tr.find_all('th', recursive=True):
                if "colspan" in th.attrs:
                    category = th.text
                else:
                    fields.append(th.text)
        records = []
        # second pass: map the td cells of each row to the column names
        for tr in rows:
            record = {}
            for col, td in enumerate(tr.find_all('td', recursive=True)):
                # rows may have more cells than there are header names;
                # use a positional name instead of raising an IndexError
                key = fields[col] if col < len(fields) else f"column{col}"
                record[key] = td.text
            if record:
                if category:
                    record["category"] = category
                records.append(record)
        return records

    def get_tables(self, header_tag: str = None) -> dict:
        """
        get all tables from my soup as a dict of lists of dicts

        Args:
            header_tag(str): if set, the name of each table is the text of
                the closest preceding sibling element with this tag

        Return:
            dict: maps each table name to the list of records (dicts) of
                that table. Tables are named "table<index>" by their
                position on the page when header_tag is not set or no
                matching header element is found. If two tables end up
                with the same name the later one replaces the earlier one.
        """
        tables = {}
        for table_index, table in enumerate(self.soup.find_all("table")):
            table_name = None
            if header_tag is not None:
                header = table.find_previous_sibling(header_tag)
                # there might be no such header in front of the table
                if header is not None:
                    table_name = header.text
            if table_name is None:
                table_name = f"table{table_index}"
            tables[table_name] = self._table_records(table)
        return tables
        