"""
File Conversion Classes
"""
import contextlib
from abc import ABC, abstractmethod
from typing import Optional, Dict, Any, Union, List, Tuple, Iterator, TYPE_CHECKING


from fileio.core.types import File, FileLike
from fileio.types.etc import ParsedFile, PreparedFile
from fileio.utils import logger
from fileio.utils.pooler import ThreadPooler
from fileio.utils.configs import settings
from fileio.utils.ops import async_checksum_file, checksum_file

from .static import KNOWN_LIGATURES

# Optional dependencies. Each import gets its own guard so that one missing
# package does not prevent the other from being loaded.
_tqdm_avail: bool = False
with contextlib.suppress(ImportError):
    from tqdm.auto import tqdm
    # Flag read by BaseConverter.progress_bar; must match the name declared above.
    _tqdm_avail = True

with contextlib.suppress(ImportError):
    import langdetect

if TYPE_CHECKING:
    with contextlib.suppress(ImportError):
        from starlette.requests import Request
        from starlette.datastructures import UploadFile

class BaseConverter(ABC):
    """
    Base class for file converters that transform input documents to text format for ingestion in Elastic.

    Subclasses implement `convert`. This class supplies file preparation
    (existence check + checksum), the sync/async `run` loops, optional
    language validation and ligature cleanup.
    """

    # Name of the conversion backend; only used in the "not enabled" messages.
    method: Optional[str] = None
    # Tri-state cache: None means `check_enabled` has not been evaluated yet.
    enabled: Optional[bool] = None
    # Class-wide default, derived from the settings flag and tqdm availability.
    progress_bar: Optional[bool] = (settings.enable_progress_bar and _tqdm_avail)
    # When True, missing files / disabled converters raise instead of warning.
    raise_errors: Optional[bool] = True

    def __init__(
        self,
        remove_numeric_tables: bool = False,
        valid_languages: Optional[List[str]] = None,
        id_hash_keys: Optional[List[str]] = None,
        progress_bar: Optional[bool] = None,
        raise_errors: Optional[bool] = None,
        **kwargs,
    ):
        """
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could be possible candidates for searching
                                      answers. The rows containing strings are thus retained in this option.
        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
                                This option can be used to add a test for encoding errors. If the extracted text is
                                not one of the valid languages, then it might likely be an encoding error resulting
                                in garbled text.
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
            In this case the id will be generated by using the content and the defined metadata.
        :param progress_bar: Show a progress bar for the conversion. `None` keeps the class-level default.
        :param raise_errors: Raise on missing files instead of logging a warning. `None` keeps the
            class-level default.
        :param kwargs: Extra options, stored on the instance as `_kwargs`.
        """
        super().__init__()

        self.remove_numeric_tables = remove_numeric_tables
        self.valid_languages = valid_languages
        self.id_hash_keys = id_hash_keys
        # Only shadow the class-level defaults when a value was given explicitly.
        if progress_bar is not None: self.progress_bar = progress_bar
        if raise_errors is not None: self.raise_errors = raise_errors

        self._kwargs = kwargs
        self.validate_enabled()

    @classmethod
    def set_progress_bar(cls, value: bool):
        """
        Set the class-level progress bar default.
        """
        cls.progress_bar = value

    @classmethod
    def check_enabled(cls, **kwargs):
        """
        Check if the converter is enabled. The base implementation always returns True.
        """
        return True

    @classmethod
    def validate_enabled(cls):
        """
        Validate if the converter is enabled.

        The result of `check_enabled` is cached on the class in `enabled`.

        NOTE(review): this is a classmethod, so it reads the class-level
        `raise_errors`; a per-instance value passed to `__init__` is not
        consulted here.

        :raises NotImplementedError: if the converter is disabled and the class-level
            `raise_errors` is truthy; otherwise a warning is logged.
        """
        if cls.enabled is None:
            cls.enabled = cls.check_enabled()
        if not cls.enabled:
            if cls.raise_errors:
                raise NotImplementedError(f"Converter {cls.__name__} is not enabled with {cls.method}")
            logger.warning(f"Converter {cls.__name__} is not enabled with {cls.method}")

    def extract_file_metadata(
        self,
        file_path: 'FileLike',
        **kwargs,
    ) -> Optional[Dict[str, Any]]:
        """
        Extract file metadata like name, size, etc. The base implementation returns an empty dict.
        """
        return {}

    async def async_extract_file_metadata(
        self,
        file_path: 'FileLike',
        **kwargs,
    ) -> Optional[Dict[str, Any]]:
        """
        Async variant of `extract_file_metadata`, dispatched through `ThreadPooler.run_async`.
        """
        return await ThreadPooler.run_async(self.extract_file_metadata, file_path, **kwargs)

    def _handle_missing_file(self, file: Any) -> None:
        """
        Raise or warn about a missing file, depending on `raise_errors`.
        """
        if self.raise_errors: raise FileNotFoundError(f"File {file} does not exist")
        logger.warning(f"File {file} does not exist")

    @staticmethod
    def _build_prepared_file(file: Any, checksum: Any) -> 'PreparedFile':
        """
        Build a `PreparedFile` from an existing `File` and its checksum.
        """
        is_cloud = file.is_cloud
        return PreparedFile.parse_obj({
            'local_path': None if is_cloud else file,
            'remote_path': file if is_cloud else None,
            # Heuristic: any path containing 'tmp' is flagged as temporary.
            'is_tmp': 'tmp' in file.as_posix(),
            'checksum': checksum,
        })

    def prepare_file(
        self,
        file: 'FileLike',
        **kwargs,
    ) -> Optional['PreparedFile']:
        """
        Prepares a single file for extraction.

        :param file: path-like input, or an already prepared `PreparedFile` (returned unchanged).
        :return: the prepared file, or `None` if the file does not exist and `raise_errors` is off.
        :raises FileNotFoundError: if the file does not exist and `raise_errors` is on.
        """
        # `run` accepts PreparedFile inputs; those must not be wrapped in File() again.
        if isinstance(file, PreparedFile): return file
        file = File(file)
        if not file.exists():
            self._handle_missing_file(file)
            return None
        return self._build_prepared_file(file, checksum_file(file))

    async def async_prepare_file(
        self,
        file: 'FileLike',
        **kwargs,
    ) -> Optional['PreparedFile']:
        """
        Async variant of `prepare_file`.

        :param file: path-like input, or an already prepared `PreparedFile` (returned unchanged).
        :return: the prepared file, or `None` if the file does not exist and `raise_errors` is off.
        :raises FileNotFoundError: if the file does not exist and `raise_errors` is on.
        """
        # `async_from_request` hands PreparedFile objects to `async_run`; pass them through.
        if isinstance(file, PreparedFile): return file
        file = File(file)
        if not await file.async_exists():
            self._handle_missing_file(file)
            return None
        return self._build_prepared_file(file, await async_checksum_file(file))

    async def async_prepare_files(
        self,
        files: Union['FileLike', List['FileLike']] = None,
        request: Optional['Request'] = None,
        upload_files: Optional[Union[List['UploadFile'], 'UploadFile']] = None,
        upload_form_keys: Optional[Union[List[str], str]] = None,
        remote_files: Optional[Union[List[str], str]] = None,
        remote_form_keys: Optional[List[str]] = None,
        **kwargs
    ) -> List['PreparedFile']:
        """
        Prepare files for conversion.

        When `files` is given, each entry is prepared individually and missing files are
        skipped (or raise, depending on `raise_errors`). Otherwise the request / upload /
        remote arguments and `kwargs` are forwarded to `PreparedFile.async_from_request`.

        :return: list of prepared files.
        """
        if files is not None:
            if not isinstance(files, list): files = [files]
            prepared_files = []
            for file in files:
                prepared = await self.async_prepare_file(file)
                # None means "missing file, warning already logged".
                if prepared is not None: prepared_files.append(prepared)
        else:
            prepared_files = await PreparedFile.async_from_request(
                request=request,
                upload_files=upload_files,
                upload_form_keys=upload_form_keys,
                remote_files=remote_files,
                remote_form_keys=remote_form_keys,
                **kwargs
            )

        if not isinstance(prepared_files, list): prepared_files = [prepared_files]
        return prepared_files

    @abstractmethod
    def convert(
        self,
        file: Union['PreparedFile', 'FileLike'],
        metadata: Optional[Dict[str, Any]] = None,
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = "UTF-8",
        id_hash_keys: Optional[List[str]] = None,
        **kwargs,
    ) -> List[ParsedFile]:
        """
        Convert a file to a list of parsed files containing the text and any associated meta data.
        File converters may extract file meta like name or size. In addition to it, user
        supplied meta data like author, url, external IDs can be supplied as a dictionary.
        :param file: the file to convert
        :param metadata: dictionary of meta data key-value pairs to append in the returned document.
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could be possible candidates for searching
                                      answers. The rows containing strings are thus retained in this option.
        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
                                This option can be used to add a test for encoding errors. If the extracted text is
                                not one of the valid languages, then it might likely be an encoding error resulting
                                in garbled text.
        :param encoding: Select the file encoding (default is `UTF-8`)
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
            In this case the id will be generated by using the content and the defined metadata.
        """
        pass

    async def async_convert(
        self,
        file: Union['PreparedFile', 'FileLike'],
        metadata: Optional[Dict[str, Any]] = None,
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = "UTF-8",
        id_hash_keys: Optional[List[str]] = None,
        **kwargs,
    ) -> List[ParsedFile]:
        """
        Async variant of `convert`: dispatches `self.convert` through `ThreadPooler.run_async`
        with the same arguments.
        :param file: the file to convert
        :param metadata: dictionary of meta data key-value pairs to append in the returned document.
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could be possible candidates for searching
                                      answers. The rows containing strings are thus retained in this option.
        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
                                This option can be used to add a test for encoding errors. If the extracted text is
                                not one of the valid languages, then it might likely be an encoding error resulting
                                in garbled text.
        :param encoding: Select the file encoding (default is `UTF-8`)
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
            In this case the id will be generated by using the content and the defined metadata.
        """
        return await ThreadPooler.run_async(
            self.convert,
            file=file,
            encoding=encoding,
            id_hash_keys=id_hash_keys,
            metadata=metadata,
            remove_numeric_tables=remove_numeric_tables,
            valid_languages=valid_languages,
            **kwargs,
        )

    def validate_language(self, text: str, valid_languages: Optional[List[str]] = None) -> bool:
        """
        Validate if the language of the text is one of valid languages.

        :param text: text whose language is detected with `langdetect`.
        :param valid_languages: accepted language codes; falls back to `self.valid_languages`.
        :return: True when no languages are configured or the detected language is accepted;
            False otherwise, including when detection fails.
        :raises ImportError: if languages are configured but `langdetect` is not installed.
        """
        if valid_languages is None: valid_languages = self.valid_languages
        if not valid_languages: return True
        # Imported here because the module-level import is optional and may not have bound the name.
        import langdetect
        try: lang = langdetect.detect(text)
        except langdetect.lang_detect_exception.LangDetectException: lang = None
        return lang in valid_languages

    def get_iterator(self, files: List['PreparedFile'], metadata: List[Dict[str, Any]]) -> Iterator[Tuple['PreparedFile', Dict[str, Any]]]:
        """
        Get an iterator over (file, metadata) pairs, wrapped in a tqdm progress bar
        when `progress_bar` is truthy.
        """
        pairs = zip(files, metadata)
        if self.progress_bar:
            return tqdm(pairs, total=len(files), desc="Converting files")
        return pairs

    @staticmethod
    def _cleanup_ligatures(parsed_files: List[ParsedFile], known_ligatures: Dict[str, str]) -> List[ParsedFile]:
        """
        Replace every known ligature in the content of each parsed file, in place.
        """
        if not known_ligatures: return parsed_files
        for parsed_file in parsed_files:
            content = parsed_file.content
            if content is None: continue
            for ligature, letters in known_ligatures.items():
                content = content.replace(ligature, letters)
            parsed_file.content = content
        return parsed_files

    def run(  # type: ignore
        self,
        files: Union['FileLike', 'PreparedFile', List['FileLike'], List['PreparedFile']],
        metadata: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None,
        remove_numeric_tables: Optional[bool] = None,
        known_ligatures: Optional[Dict[str, str]] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = "UTF-8",
        id_hash_keys: Optional[List[str]] = None,
        extract_metadata: Optional[bool] = None,
        **kwargs,
    ) -> List[ParsedFile]:
        """
        Extract text from one or more files.
        :param files: A single file or a list of files you want to convert
        :param metadata: Optional metadata to attach to the resulting documents. A single dictionary
                         (or None) is applied to every file; a list is paired with `files` by position.
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could be possible candidates for searching
                                      answers. The rows containing strings are thus retained in this option.
        :param known_ligatures: Some converters tend to recognize clusters of letters as ligatures, such as "ﬀ" (double f).
                                Such ligatures however make text hard to compare with the content of other files,
                                which are generally ligature free. Therefore we automatically find and replace the most
                                common ligatures with their split counterparts. If no value is provided, the
                                `KNOWN_LIGATURES` mapping imported from the sibling `static` module is used.
                                You can use this parameter to provide your own set of ligatures to clean up from the documents.
        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
                                This option can be used to add a test for encoding errors. If the extracted text is
                                not one of the valid languages, then it might likely be an encoding error resulting
                                in garbled text.
        :param encoding: Select the file encoding (default is `UTF-8`)
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
            In this case the id will be generated by using the content and the defined metadata.
        :param extract_metadata: Forwarded unchanged to `convert` as a keyword argument.
        :return: the parsed files of all inputs, with ligatures replaced.
        """
        if known_ligatures is None: known_ligatures = KNOWN_LIGATURES
        if not isinstance(files, list): files = [files]
        prepared_files = [self.prepare_file(file) for file in files]

        # A single dict (or None) is broadcast to every file.
        if isinstance(metadata, dict) or metadata is None: metadata = [metadata] * len(prepared_files)
        parsed_files: List[ParsedFile] = []

        for prepared_file, file_meta in self.get_iterator(prepared_files, metadata):
            # `prepare_file` yields None for a missing file when `raise_errors` is off.
            if prepared_file is None: continue
            parsed_files.extend(
                self.convert(
                    file=prepared_file,
                    metadata=file_meta,
                    remove_numeric_tables=remove_numeric_tables,
                    valid_languages=valid_languages,
                    encoding=encoding,
                    id_hash_keys=id_hash_keys,
                    extract_metadata=extract_metadata,
                    **kwargs,
                )
            )

        return self._cleanup_ligatures(parsed_files, known_ligatures)

    async def async_run(  # type: ignore
        self,
        files: Union['FileLike', 'PreparedFile', List['FileLike'], List['PreparedFile']],
        metadata: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None,
        remove_numeric_tables: Optional[bool] = None,
        known_ligatures: Optional[Dict[str, str]] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = "UTF-8",
        id_hash_keys: Optional[List[str]] = None,
        extract_metadata: Optional[bool] = None,
        **kwargs,
    ) -> List[ParsedFile]:
        """
        Async variant of `run`: extract text from one or more files.
        :param files: A single file or a list of files you want to convert
        :param metadata: Optional metadata to attach to the resulting documents. A single dictionary
                         (or None) is applied to every file; a list is paired with `files` by position.
        :param remove_numeric_tables: This option uses heuristics to remove numeric rows from the tables.
                                      The tabular structures in documents might be noise for the reader model if it
                                      does not have table parsing capability for finding answers. However, tables
                                      may also have long strings that could be possible candidates for searching
                                      answers. The rows containing strings are thus retained in this option.
        :param known_ligatures: Some converters tend to recognize clusters of letters as ligatures, such as "ﬀ" (double f).
                                Such ligatures however make text hard to compare with the content of other files,
                                which are generally ligature free. Therefore we automatically find and replace the most
                                common ligatures with their split counterparts. If no value is provided, the
                                `KNOWN_LIGATURES` mapping imported from the sibling `static` module is used.
                                You can use this parameter to provide your own set of ligatures to clean up from the documents.
        :param valid_languages: validate languages from a list of languages specified in the ISO 639-1
                                (https://en.wikipedia.org/wiki/ISO_639-1) format.
                                This option can be used to add a test for encoding errors. If the extracted text is
                                not one of the valid languages, then it might likely be an encoding error resulting
                                in garbled text.
        :param encoding: Select the file encoding (default is `UTF-8`)
        :param id_hash_keys: Generate the document id from a custom list of strings that refer to the document's
            attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
            not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
            In this case the id will be generated by using the content and the defined metadata.
        :param extract_metadata: Forwarded unchanged to `async_convert` as a keyword argument.
        :return: the parsed files of all inputs, with ligatures replaced.
        """
        if known_ligatures is None: known_ligatures = KNOWN_LIGATURES
        if not isinstance(files, list): files = [files]
        prepared_files = [await self.async_prepare_file(file) for file in files]

        # A single dict (or None) is broadcast to every file.
        if isinstance(metadata, dict) or metadata is None: metadata = [metadata] * len(prepared_files)
        parsed_files: List[ParsedFile] = []

        for prepared_file, file_meta in self.get_iterator(prepared_files, metadata):
            # `async_prepare_file` yields None for a missing file when `raise_errors` is off.
            if prepared_file is None: continue
            parsed_files.extend(
                await self.async_convert(
                    file=prepared_file,
                    metadata=file_meta,
                    remove_numeric_tables=remove_numeric_tables,
                    valid_languages=valid_languages,
                    encoding=encoding,
                    id_hash_keys=id_hash_keys,
                    extract_metadata=extract_metadata,
                    **kwargs,
                )
            )

        return self._cleanup_ligatures(parsed_files, known_ligatures)

    def run_batch(  # type: ignore
        self,
        files: Union['FileLike', 'PreparedFile', List['FileLike'], List['PreparedFile']],
        metadata: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None,
        remove_numeric_tables: Optional[bool] = None,
        known_ligatures: Optional[Dict[str, str]] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = "UTF-8",
        id_hash_keys: Optional[List[str]] = None,
        extract_metadata: Optional[bool] = None,
        **kwargs,
    ) -> List[ParsedFile]:
        """
        Batch entry point; delegates to `run` with the same arguments.
        """
        return self.run(
            files=files,
            metadata=metadata,
            remove_numeric_tables=remove_numeric_tables,
            known_ligatures=known_ligatures,
            valid_languages=valid_languages,
            encoding=encoding,
            id_hash_keys=id_hash_keys,
            extract_metadata=extract_metadata,
            **kwargs,
        )

    async def async_run_batch(  # type: ignore
        self,
        files: Union['FileLike', 'PreparedFile', List['FileLike'], List['PreparedFile']],
        metadata: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None,
        remove_numeric_tables: Optional[bool] = None,
        known_ligatures: Optional[Dict[str, str]] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = "UTF-8",
        id_hash_keys: Optional[List[str]] = None,
        extract_metadata: Optional[bool] = None,
        **kwargs,
    ) -> List[ParsedFile]:
        """
        Async batch entry point; delegates to `async_run` with the same arguments.
        """
        return await self.async_run(
            files=files,
            metadata=metadata,
            remove_numeric_tables=remove_numeric_tables,
            known_ligatures=known_ligatures,
            valid_languages=valid_languages,
            encoding=encoding,
            id_hash_keys=id_hash_keys,
            extract_metadata=extract_metadata,
            **kwargs,
        )

    async def async_from_request(
        self,
        request: Optional['Request'] = None,
        upload_files: Optional[Union[List['UploadFile'], 'UploadFile']] = None,
        upload_form_keys: Optional[Union[List[str], str]] = None,
        remote_files: Optional[Union[List[str], str]] = None,
        remote_form_keys: Optional[List[str]] = None,

        metadata: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None,
        remove_numeric_tables: Optional[bool] = None,
        known_ligatures: Optional[Dict[str, str]] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = "UTF-8",
        id_hash_keys: Optional[List[str]] = None,
        extract_metadata: Optional[bool] = None,
        **kwargs
    ) -> List[ParsedFile]:
        """
        Parse a request and extract text from the files.

        The request / upload / remote arguments are resolved into prepared files by
        `PreparedFile.async_from_request`; the remaining arguments are passed to `async_run`.
        Note that `kwargs` is forwarded to both calls.
        """
        prepared_files = await PreparedFile.async_from_request(
            request=request,
            upload_files=upload_files,
            upload_form_keys=upload_form_keys,
            remote_files=remote_files,
            remote_form_keys=remote_form_keys,
            **kwargs
        )
        return await self.async_run(
            files=prepared_files,
            metadata=metadata,
            remove_numeric_tables=remove_numeric_tables,
            known_ligatures=known_ligatures,
            valid_languages=valid_languages,
            encoding=encoding,
            id_hash_keys=id_hash_keys,
            extract_metadata=extract_metadata,
            **kwargs,
        )




