"""arXiv TeX produced routines.

This package checks a submission file object to determine whether it was generated
from TeX. arXiv requires the TeX source, when available, to make long-term maintenance
of papers easier. Single-file submissions determined to be generated by TeX are rejected.

The tex-produced check currently works on both streams and files for PDFs and just files
for Postscript files. The original code only worked with files. In NG we pass around streams
of file content which would require writing out in order to use the file-based check.

We now convert the non-seekable stream to an in-memory object and use the PyPDF2 package
to extract info and fonts information from a PDF.

There is then a specific set of information we look for in the PDF info and certain
fonts that indicate the content was generated by some form of TeX engine.

"""
import io
import re
import subprocess
import os
import tempfile

from typing import IO, List
from arxiv.base import logging  # type: ignore

from PyPDF2 import PdfFileReader, utils

logger = logging.getLogger(__name__)

verbose = 0


def get_filtered_pdf_info_from_stream(stream: IO[bytes]) -> List[bytes]:
    """
    Return selected document-information entries of a PDF, pdfinfo style.

    Parameters
    ----------
    stream : IO[bytes]
        The stream of PDF bytes to read the document information from.

    Returns
    -------
    List containing one UTF-8 encoded ``bytes`` entry for each of the
    Creator, Title and Producer fields that is present, in that order.
    Each entry is formatted like a line of pdfinfo output, i.e. a
    16-character label column followed by the value. The list is empty
    when the PDF carries no document information at all.

    Raises
    ------
    PyPDF2.utils.PdfReadError
        When PyPDF2 cannot read the PDF. The error is logged and re-raised.
    Exception
        Any other failure is logged and re-raised unchanged.

    """
    try:
        pdf = PdfFileReader(stream)
        information = pdf.getDocumentInfo()

        # getDocumentInfo() yields None for a PDF without an /Info
        # dictionary; that is a valid PDF with nothing to report, not an error.
        if information is None:
            return []

        # Labels are padded so the value starts in the same column as in
        # pdfinfo output; the TeX-produced regexes anchor on these labels.
        labelled_values = (
            ('Creator:        ', information.creator),
            ('Title:          ', information.title),
            ('Producer:       ', information.producer),
        )
        return [bytes(label + value, 'utf-8')
                for label, value in labelled_values
                if value is not None]
    except utils.PdfReadError as ex:
        logger.error(f"get_filtered_pdf_info_from_stream: Can't read PDF: Error: {ex}")
        raise
    except Exception as ex:
        logger.error(f"Error: Unknown {ex}")
        raise


# This find-font code [walk() and parts of get_pdf_fonts_from_stream()] is
# taken from the web and originally written by Tim Arnold.
# I have modified it for our needs. DLF2

def walk(obj, fnt, emb):
    '''
    Collect the fonts referenced anywhere below a PDF dictionary object.

    If there is a key called 'BaseFont', that is a font that is used in the document.
    If there is a key called 'FontName' and another key in the same dictionary object
    that is called 'FontFilex' (where x is null, 2, or 3), then that fontname is
    embedded.

    We add to two caller-supplied sets, fnt = fonts used and emb = fonts embedded.
    Font names in fnt are stored as UTF-8 bytes with surrounding '/' removed;
    names in emb are stored exactly as found.

    Only mapping-like values (anything with a 'keys' attribute) are descended
    into. Every dictionary is visited at most once, so object graphs that
    contain cycles terminate instead of recursing without bound.

    Returns the same (fnt, emb) sets that were passed in, including when obj
    is not a mapping.
    '''
    fontkeys = ('/FontFile', '/FontFile2', '/FontFile3')

    # Keyed by id(); the value keeps the object alive so its id cannot be
    # recycled for a different object while the walk is still running.
    visited = {}
    pending = [obj]

    while pending:
        node = pending.pop()
        if not hasattr(node, 'keys') or id(node) in visited:
            continue
        visited[id(node)] = node

        if '/BaseFont' in node:
            fnt.add(bytes(node['/BaseFont'], 'utf-8').strip(b"/"))
        # A FontName only counts as embedded when a font program sits next to it.
        if '/FontName' in node and any(key in node for key in fontkeys):
            emb.add(node['/FontName'])

        pending.extend(node[key] for key in node.keys())

    return fnt, emb


def get_pdf_fonts_from_stream(stream: IO[bytes]) -> List[bytes]:
    """
    Return the list of fonts used by the PDF read from the given stream.

    Parameters
    ----------
    stream : IO[bytes]
        The stream of PDF bytes to collect font names from.

    Returns
    -------
    Sorted list containing one ``bytes`` entry per distinct font name found
    under the '/Resources' of any page. Pages without '/Resources' are
    skipped.

    Raises
    ------
    PyPDF2.utils.PdfReadError
        When PyPDF2 cannot read the PDF. The error is logged and re-raised.
    Exception
        Any other failure is logged and re-raised unchanged.

    """
    fonts = set()
    # I don't believe we need the embedded fonts but I'm leaving
    # it here for now. This might be useful when we check whether a file
    # embeds standard system fonts.
    embedded = set()

    try:
        pdf = PdfFileReader(stream)

        for page in pdf.pages:
            obj = page.getObject()
            # Use 'in' plus indexing rather than .get(): indexing is what
            # walk() itself relies on to obtain the nested objects.
            if '/Resources' not in obj:
                continue
            # walk() adds to both sets in place, so its return value is not
            # needed; ignoring it keeps a non-dictionary '/Resources' entry
            # on one page from aborting the scan of the remaining pages.
            walk(obj['/Resources'], fonts, embedded)

    except utils.PdfReadError as ex:
        logger.error(f"get_pdf_fonts_from_stream: Can't read PDF: Error: {ex}")
        raise
    except Exception as ex:
        logger.error(f"Error: {ex}")
        raise

    return sorted(fonts)

# Regexes to help us parse list of fonts from pdffonts.
# Both are bytes patterns because the pdffonts output is processed as bytes.

# Matches a line that starts with four dashes; such lines are skipped when
# parsing (presumably the separator row under the pdffonts column headings).
regex_header = re.compile(br"^\-\-\-\-", re.IGNORECASE)
# Extracts the font-name part of a line: everything from the start of the
# line up to the last '+' (or, failing that, the last '-') plus the run of
# non-space characters that follows it. Lines containing neither '+' nor '-'
# do not match and are therefore ignored.
regex_fnt = re.compile(br"^.*\+[^\ ]*|^.*\-[^\ ]*", re.IGNORECASE)


def get_pdf_fonts_from_file(filepath: str) -> List[bytes]:
    """
    Return the list of fonts that the pdffonts command reports for a file.

    Parameters
    ----------
    filepath : str
        The file to run pdffonts on.

    Returns
    -------
    Sorted list containing one ``bytes`` entry for each line of pdffonts
    output from which a font name could be extracted. Lines starting with
    '----' and lines without a recognisable font name are left out.

    """
    info = subprocess.run(["pdffonts", f"{filepath}"], stderr=subprocess.PIPE,
                          stdout=subprocess.PIPE)

    # A failed run used to be indistinguishable from "no fonts"; record it,
    # but still parse whatever output was produced.
    if info.returncode != 0:
        logger.error(f"get_pdf_fonts_from_file: pdffonts exited with status "
                     f"{info.returncode}: {info.stderr!r}")

    # Clean up the output by extracting font names
    matches = (regex_fnt.search(line)
               for line in info.stdout.splitlines()
               if not regex_header.search(line))

    return sorted(match[0] for match in matches if match)


# Regular expressions that indicate TeX Produced (for pdfinfo)
# All are case-insensitive bytes patterns, each applied to a single
# 'Creator:', 'Title:' or 'Producer:' entry.

# Creator mentions dvips.
regex1 = re.compile(br"^Creator.*dvips", re.IGNORECASE)
# Title contains '.dvi'.
regex2 = re.compile(br"^Title.*\.dvi", re.IGNORECASE)
# Creator/Producer contains 'TeX' that is not immediately followed by 't'
# or by 'macs-1.'.
regex3 = re.compile(br"^(Creator|Producer).*TeX(?!(t|macs-1\.))", re.IGNORECASE)
# Creator/Producer mentions dvipdfm or dvipdfmx.
regex4 = re.compile(br"^(Creator|Producer).*dvipdfmx?", re.IGNORECASE)
# Creator/Producer mentions cairo.
regex5 = re.compile(br"^(Creator|Producer).*cairo.*$", re.IGNORECASE)

# The PDF check tries these in list order and reports the first match.
regexes = [regex1, regex2, regex3, regex4, regex5]

# Ignore this case where WYSIWYG software uses TeX for back end processing.
regexTeXmacs = re.compile(br"^(Creator|Producer).*TeXmacs-1\.", re.IGNORECASE)

# PDFs using Cairo or CMR/RTX fonts were generated by TeX
# Matches a name starting with 'CairoFont-<digit>-<digit>', or a name whose
# leading non-space run is followed by '+cmr' or '+rtx'.
fontsrgex = re.compile(br"^CairoFont\-\d-\d|^\S+\+(?:cmr|rtx)", re.IGNORECASE)


# PDF
def check_tex_produced_pdf_from_stream(stream: IO[bytes]) -> bytes:
    """
    Decide whether the PDF supplied as a byte stream was produced by TeX.

    The document information is examined first; the fonts are only
    inspected when the information gives no verdict. A failure to read
    either one is logged and treated as "nothing found".

    Parameters
    ----------
    stream : IO[bytes]
        The stream of file bytes for PDF to be checked for TeX produced.

    Returns
    -------
        The matched bytes (regex match) if PDF is TeX-produced, otherwise
        return b''.

    """
    try:
        pdf_info = get_filtered_pdf_info_from_stream(stream)
    except Exception as ex:
        logger.error(f'Error getting pdfinfo: {ex}')
        pdf_info = []

    if verbose:
        logger.debug(f"\nInfo: '{pdf_info}' from PDF")

    # Patterns take precedence in list order; within one pattern the first
    # matching info entry wins.
    for pattern in regexes:
        for entry in pdf_info:
            found = pattern.search(entry)
            if found:
                return found.group(0)

    # Skip fonts check when TeXmacs (uses TeX as its backend)
    if any(regexTeXmacs.search(entry) for entry in pdf_info):
        return b''

    # Check for TeX fonts
    try:
        font_names = get_pdf_fonts_from_stream(stream)
    except Exception as ex:
        logger.error(f'Error getting pdffonts: {ex}')
        font_names = []

    for font_name in font_names:
        found = fontsrgex.search(font_name)
        if found:
            return found.group(0)

    return b''


# Postscript check
#
# Markers of TeX-related producers, searched in the first 500 lines.
_PS_TEX_MARKER_RE = re.compile(
    br'TeXdict|dvit?ps|ArborText|OzTeX|PCTEX|^%.VTeX|'
    br'^%%Creator:.*Textures|^%%Title: .*\.dvi\b',
    re.IGNORECASE)

# Subsetted Computer Modern Roman font name, searched in the first 2000 lines.
_PS_CMR10_RE = re.compile(br'[A-Z]{5,6}\+CMR10', re.IGNORECASE)


def _first_matching_line(file_path: str, pattern, max_lines: int) -> bytes:
    """Return the first of the leading max_lines lines that matches pattern.

    The line is returned newline-terminated (a newline is appended when the
    file ends without one); b'' is returned when no line matches.
    """
    with open(file_path, 'rb') as handle:
        for line_number, line in enumerate(handle, start=1):
            if line_number > max_lines:
                break
            if pattern.search(line):
                return line if line.endswith(b'\n') else line + b'\n'
    return b''


def check_tex_produced_ps(file_path: str) -> bytes:
    """
    Check whether specified Postscript file was produced by TeX.

    The file is read directly rather than through a shell pipeline, so the
    path is never interpreted by a shell and may contain spaces or shell
    metacharacters.

    Parameters
    ----------
    file_path : str
        The file path for Postscript file to be checked for TeX produced.

    Returns
    -------
    bytes
        The first line (newline-terminated) that identifies the file as
        TeX-produced: either a line among the first 500 containing a known
        TeX marker, or else a line among the first 2000 naming a subsetted
        CMR10 font. Returns b'' when neither is found, when the file does
        not exist, or when it cannot be read.

    """
    if not os.path.exists(file_path):
        return b''

    try:
        marker_line = _first_matching_line(file_path, _PS_TEX_MARKER_RE, 500)
        if marker_line:
            return marker_line
        return _first_matching_line(file_path, _PS_CMR10_RE, 2000)
    except OSError as ex:
        # An unreadable path (directory, permissions, ...) has always meant
        # "not TeX-produced" here; keep that, but leave a trace in the log.
        logger.error(f"check_tex_produced_ps: Can't read '{file_path}': {ex}")
        return b''
