
# Composite Bridge Keys and Composite Tables Generator for the Hypercube
# ---------------------------------------------------------
# This class automates the process of generating composite keys and link tables
# for a set of relational tables. The tables passed in are modified in place.
#
# Purpose:
# - When two or more tables are linked by more than one column (they share multiple columns),
#   we need to combine these columns into a single composite key that uniquely represents the relationship.
# - This class detects such multi-column links, generates a composite key by merging the values
#   of the shared columns, and uses this key to efficiently link tables.
# - The idea is to maintain a schema that is suitable for the queries while keeping the underlying data intact.



import pandas as pd
from typing import Dict, Any, List

class CompositeBridgeGenerator:
    """Bridge tables that are linked through more than one shared column.

    For every group of tables that share two or more columns, a link
    ("composite") table named ``_composite_<table names>`` is added to
    ``tables``.  Every table that contains all columns of a detected column
    combination receives a ``_composite_key_<columns>`` column holding an
    auto-incremented integer key, and the original shared columns of the
    non-link tables are renamed to ``"<column> <<table>>"`` so that the
    integer key is the only remaining link.

    Attributes:
        tables: All tables, including the generated link tables.
        composite_tables: The generated link tables only (the same DataFrame
            objects that are stored in ``tables``).
        composite_keys: Table name -> list of composite key column names.
        column_combinations: Distinct multi-column combinations found.
        key_mapping: Concatenated key string -> auto-incremented integer.
        column_table_matrix: 0/1 matrix, columns as rows and tables as columns.
    """

    def __init__(
        self,
        tables: Dict[str, pd.DataFrame],
        print_info: bool = False
    ) -> None:
        """Detect multi-column links, build link tables and assign integer keys.

        Args:
            tables: Table name -> DataFrame.  The dict and its DataFrames are
                modified in place (link tables are added, key columns are
                added and shared columns are renamed).
            print_info: When True, print the generated link tables and their
                columns.
        """
        self.tables: Dict[str, pd.DataFrame] = tables  # all tables (including link tables)
        self.composite_tables: Dict[str, pd.DataFrame] = {}  # the link tables generated by this class
        self.composite_keys: Dict[str, Any] = {}  # the composite keys for each table
        self.column_combinations: List[Any] = []  # Track shared column combinations
        self.key_mapping: Dict[Any, Any] = {}  # Store the mapping of composite keys to auto-incremented keys
        self.column_table_matrix = self._create_column_table_matrix()

        # Identify and create link tables based on shared columns
        if self._create_link_tables():
            if print_info:
                print("CompositeBridge table/s created")
                for table in self.composite_tables:
                    print(' - ', table, '->', self.tables[table].columns.to_list())

            # After creating link tables, apply composite keys
            self._apply_composite_keys()

            # Replace composite keys with auto-incremented keys
            self._replace_with_autonumbered_keys()

    def _create_column_table_matrix(self) -> pd.DataFrame:
        """Return a 0/1 matrix telling which column appears in which table.

        Rows are the distinct column names (in first-seen order, so the
        result is reproducible between runs) and columns are the table names.
        """
        all_columns = list(dict.fromkeys(
            column
            for table_data in self.tables.values()
            for column in table_data.columns
        ))

        matrix = pd.DataFrame(0, index=all_columns, columns=list(self.tables))
        for table_name, table_data in self.tables.items():
            for column in table_data.columns:
                matrix.at[column, table_name] = 1

        return matrix

    def _create_composite_key(self, df: pd.DataFrame, columns: List[Any]) -> pd.Series:
        """Return the row-wise '-'-joined string form of ``df[columns]``.

        Values are compared through their string representation, so the same
        value stored with different dtypes (e.g. ``1`` and ``1.0``) yields
        different keys.
        """
        return df[columns].astype(str).agg('-'.join, axis=1)

    def _create_link_tables(self) -> bool:
        """Create one link table per group of tables sharing 2+ columns.

        Tables are visited in the insertion order of ``self.tables``.  Each
        not-yet-processed table is outer-joined with every later unprocessed
        table that shares at least two columns with the running result; the
        joined column sets are recorded (once each) in
        ``self.column_combinations``.

        Returns:
            True when at least one link table was created.
        """
        matrix = self.column_table_matrix
        # Columns present in more than one table
        shared_columns = set(matrix.loc[matrix.sum(axis=1) > 1].index)

        # Snapshot the names: link tables are added to self.tables below and
        # must not become merge candidates themselves.
        table_names = list(self.tables)
        shared_by_table = {
            name: [col for col in self.tables[name].columns if col in shared_columns]
            for name in table_names
        }

        seen_combinations = set()
        processed = set()

        for position, current_name in enumerate(table_names):
            if current_name in processed:
                continue
            processed.add(current_name)

            current_columns = shared_by_table[current_name]
            current_table = self.tables[current_name][current_columns]
            merged_names = [current_name]

            for next_name in table_names[position + 1:]:
                if next_name in processed:
                    continue

                next_columns = shared_by_table[next_name]
                next_column_set = set(next_columns)
                # Keep the running table's column order so key names and key
                # strings do not depend on set iteration order.
                columns_to_join = list(dict.fromkeys(
                    col for col in current_columns if col in next_column_set
                ))
                if len(columns_to_join) < 2:
                    continue

                # Record each distinct combination once; duplicates would make
                # the link table's key column be re-mapped a second time.
                combination_id = frozenset(columns_to_join)
                if combination_id not in seen_combinations:
                    seen_combinations.add(combination_id)
                    self.column_combinations.append(columns_to_join)

                current_table = pd.merge(
                    current_table,
                    self.tables[next_name][next_columns],
                    on=columns_to_join,
                    how='outer'
                )
                current_columns = current_table.columns.tolist()
                merged_names.append(next_name)
                processed.add(next_name)

            if len(merged_names) > 1:
                composite_table_name = "_composite_" + "_".join(sorted(merged_names))
                link_table = current_table.drop_duplicates()
                self.tables[composite_table_name] = link_table
                self.composite_tables[composite_table_name] = link_table

        # Keep only the columns that take part in some combination, in the
        # link table's own column order.
        columns_to_keep = {col for combo in self.column_combinations for col in combo}
        for composite_table_name in self.composite_tables:
            link_table = self.tables[composite_table_name]
            final_columns = [col for col in link_table.columns if col in columns_to_keep]
            trimmed = link_table[final_columns].drop_duplicates()
            # Store the same object in both dicts so composite_tables also
            # reflects the key columns added later.
            self.tables[composite_table_name] = trimmed
            self.composite_tables[composite_table_name] = trimmed

        return bool(self.composite_tables)

    def _apply_composite_keys(self) -> None:
        """Add a composite key column for every combination a table fully contains.

        All matching combinations are determined before any column is
        renamed, so a table that takes part in overlapping combinations gets
        a key for each of them.  Shared columns are renamed only in non-link
        tables.
        """
        for table_name, table_data in self.tables.items():
            matching = [
                columns
                for columns in self.column_combinations
                if all(col in table_data.columns for col in columns)
            ]
            if not matching:
                continue

            key_names = self.composite_keys.setdefault(table_name, [])
            for columns in matching:
                composite_key_column_name = f"_composite_key_{'_'.join(columns)}"
                table_data[composite_key_column_name] = self._create_composite_key(table_data, columns)
                if composite_key_column_name not in key_names:
                    key_names.append(composite_key_column_name)

            # Rename the original shared columns, only from non-link tables
            if table_name not in self.composite_tables:
                renames = {
                    column: f'{column} <{table_name}>'
                    for columns in matching
                    for column in columns
                }
                table_data.rename(columns=renames, inplace=True)

    def _replace_with_autonumbered_keys(self) -> None:
        """Replace the string composite keys with auto-incremented integers.

        The mapping is shared by all tables and key columns, so equal key
        strings always receive the same integer.
        """
        key_counter = 1  # Start the key counter
        for table_name, composite_key_columns in self.composite_keys.items():
            table = self.tables[table_name]
            # dict.fromkeys guards against mapping the same column twice.
            for composite_key_column in dict.fromkeys(composite_key_columns):
                for key in table[composite_key_column].unique():
                    if key not in self.key_mapping:
                        self.key_mapping[key] = key_counter
                        key_counter += 1

                table[composite_key_column] = table[composite_key_column].map(self.key_mapping)


