import base64
import gzip
import io
import mimetypes
import re
from typing import List, Optional

import fitz
import requests
from PIL import Image
from fitz import Pixmap
from pydantic import BaseModel
from requests_toolbelt import MultipartEncoder

from kikyopp.utils.constants import YURI_API_HOST, TIKA_HOST, PADDLEOCR_HOST, PDF_CONVERTER_HOST, FILE_CONTENT_LIMIT, \
    RETRY_API_TIMES
from kikyopp.utils.retry import retry_rest_api


def pdf_to_image(data: bytes, limit: Optional[int] = None, format: str = 'png', **params):
    """Render the pages of a PDF document to encoded images.

    Args:
        data: Raw bytes of the PDF document.
        limit: Maximum number of pages to render, counted from the first
            page. ``None`` renders every page; zero or a negative value
            yields an empty list without opening the document.
        format: Output image format. With ``'png'`` the bytes produced by
            the renderer are returned unchanged; any other value is passed
            as ``format`` to Pillow's ``Image.save`` to re-encode the page.
        **params: Extra keyword arguments forwarded to ``Image.save``
            (only used when ``format`` is not ``'png'``).

    Returns:
        A list with one encoded image (``bytes``) per rendered page, in
        page order.
    """
    # A non-positive limit asks for no pages at all; previously the limit was
    # only checked after the first page had been appended, so one page leaked
    # through.
    if limit is not None and limit <= 0:
        return []

    result = []
    pdf = fitz.Document(stream=io.BytesIO(data), filetype='pdf')
    try:
        for page in pdf:
            # Render each PDF page to an image.
            img: Pixmap = page.getPixmap()
            # tobytes() is called with its defaults; the 'png' branch below
            # relies on that default output being PNG data.
            img_bytes = img.tobytes()
            if format == 'png':
                result.append(img_bytes)
            else:
                # Re-encode through Pillow into the requested format.
                converted = io.BytesIO()
                Image.open(io.BytesIO(img_bytes)).save(converted, format=format, **params)
                result.append(converted.getvalue())
            if limit is not None and len(result) >= limit:
                break
    finally:
        # Release the document even if rendering or re-encoding fails.
        pdf.close()
    return result


def pdf_to_png(data: bytes, limit: Optional[int] = None) -> List[bytes]:
    """Render PDF pages to PNG images.

    Thin wrapper around ``pdf_to_image`` that relies on its default output
    format.

    Args:
        data: Raw bytes of the PDF document.
        limit: Optional cap on the number of pages to render.

    Returns:
        One encoded image per rendered page, in page order.
    """
    pages = pdf_to_image(data, limit=limit)
    return pages


@retry_rest_api(RETRY_API_TIMES)
def download_file_from_bus(file_id: str) -> bytes:
    """Download a file's raw bytes from the filebus endpoint.

    Args:
        file_id: Identifier sent as the ``file_id`` query parameter.

    Returns:
        The response body as bytes.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (how retries interact with this is up to ``retry_rest_api``).
    """
    endpoint = f'{YURI_API_HOST}/filebus/download'
    query = {'file_id': file_id}
    response = requests.get(endpoint, params=query, timeout=20)
    response.raise_for_status()
    return response.content


@retry_rest_api(RETRY_API_TIMES)
def download_by_file_link(file_link: str) -> bytes:
    """Download the resource at ``file_link`` and return its raw bytes.

    Args:
        file_link: URL to fetch with an HTTP GET (20 second timeout).

    Returns:
        The response body as bytes.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (how retries interact with this is up to ``retry_rest_api``).
    """
    response = requests.get(file_link, timeout=20)
    response.raise_for_status()
    return response.content


@retry_rest_api(RETRY_API_TIMES)
def extract_content(source: bytes) -> Optional[str]:
    """Extract plain text from a document by sending it to the Tika server.

    Args:
        source: Raw bytes of the document, sent as the PUT request body.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (how retries interact with this is up to ``retry_rest_api``).
    """
    endpoint = f'{TIKA_HOST}/tika'
    # Ask the server for a plain-text rendering of the document.
    wanted = {'Accept': 'text/plain'}
    response = requests.put(endpoint, data=source, timeout=20, headers=wanted)
    response.raise_for_status()
    raw_text = response.content
    return raw_text.decode('utf-8')


def extract_content_by_ocr(source: bytes) -> Optional[str]:
    """Run OCR on an image and return the recognized text as one string.

    Args:
        source: Raw bytes of the image, passed straight to ``ocr``.

    Returns:
        The ``'text'`` entries of the OCR results concatenated in order,
        with no separator between them.
    """
    return ''.join(item['text'] for item in ocr(source))


@retry_rest_api(RETRY_API_TIMES)
def ocr(source: bytes) -> Optional[List[dict]]:
    """Send one image to the PaddleOCR service and return its results.

    Args:
        source: Raw bytes of the image; sent base64-encoded in a JSON body.

    Returns:
        The first element of the ``'results'`` list in the JSON response,
        i.e. the results for the single image that was submitted.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (how retries interact with this is up to ``retry_rest_api``).
    """
    encoded = base64.b64encode(source).decode('utf8')
    response = requests.post(
        url=f'{PADDLEOCR_HOST}/predict/ocr_system',
        json={"images": [encoded]},
        timeout=20,
    )
    response.raise_for_status()
    body = response.json()
    # Only one image was submitted, so only the first result set is relevant.
    return body['results'][0]


# File extension (without the dot) -> MIME type for the office/PDF formats
# that doc_to_pdf accepts for conversion.
content_file_ext = dict(
    doc='application/msword',
    docx='application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    pdf='application/pdf',
    ppt='application/vnd.ms-powerpoint',
    pptx='application/vnd.openxmlformats-officedocument.presentationml.presentation',
    xls='application/vnd.ms-excel',
    xlsx='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
)


@retry_rest_api(RETRY_API_TIMES)
def doc_to_pdf(source: bytes, filename: Optional[str] = None) -> Optional[bytes]:
    """Convert an office document to PDF through the PDF converter service.

    Args:
        source: Raw bytes of the document to convert.
        filename: Name of the document; only the text after its last ``.``
            is used, to pick the MIME type from ``content_file_ext``. The
            match is case-insensitive.

    Returns:
        The converted PDF as bytes, or ``None`` when ``filename`` is missing
        or its extension is not listed in ``content_file_ext``.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (how retries interact with this is up to ``retry_rest_api``).
    """
    if not filename:
        return None

    # Lower-case the extension so names such as 'REPORT.DOCX' are recognized;
    # the keys of content_file_ext are all lower-case.
    ext = filename.rsplit('.', maxsplit=1)[-1].lower()
    mime_type = content_file_ext.get(ext)
    if mime_type is None:
        return None

    # Multipart file part: (upload name, file object, MIME type).
    part = (f'data.{ext}', io.BytesIO(source), mime_type)
    payload = MultipartEncoder({'data': part})
    resp = requests.post(
        f'{PDF_CONVERTER_HOST}/lool/convert-to/pdf',
        data=payload,
        # The encoder generates the multipart boundary, so it must supply
        # the Content-Type header.
        headers={'Content-Type': payload.content_type},
        timeout=20,
    )
    resp.raise_for_status()
    return resp.content


@retry_rest_api(RETRY_API_TIMES)
def extract_file_meta(source: bytes) -> dict:
    """Fetch a document's metadata from the Tika server as parsed JSON.

    Args:
        source: Raw bytes of the document, sent as the PUT request body.

    Returns:
        The JSON response body parsed by ``Response.json()``.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (how retries interact with this is up to ``retry_rest_api``).
    """
    endpoint = f'{TIKA_HOST}/meta'
    # Ask the server for the metadata as JSON.
    wanted = {'Accept': 'application/json'}
    response = requests.put(endpoint, data=source, headers=wanted, timeout=20)
    response.raise_for_status()
    return response.json()


class FileInspection(BaseModel):
    """Result of inspecting a file: its bytes plus the detected type details.

    Every field defaults to ``None`` and ``inspect_file`` may pass ``None``
    explicitly (for example when no extension can be guessed), so each field
    is declared ``Optional`` rather than relying on an implicit conversion.
    """

    # File content; inspect_file stores the decompressed bytes for gzip input.
    file_bytes: Optional[bytes] = None
    # File extension without the leading dot, when one could be guessed.
    file_ext: Optional[str] = None
    # MIME type with any ';'-separated parameters removed.
    content_type: Optional[str] = None
    # Value of the 'Content-Encoding' metadata entry, when present.
    content_encoding: Optional[str] = None


def inspect_file(file_bytes: bytes) -> FileInspection:
    """Detect the content type, encoding and extension of a file.

    Metadata is obtained through ``extract_file_meta``. When the file is
    reported as ``application/gzip`` it is decompressed once and the
    decompressed bytes are inspected instead.

    Args:
        file_bytes: Raw bytes of the file to inspect.

    Returns:
        A ``FileInspection`` holding the (possibly decompressed) bytes, the
        guessed extension without its leading dot, the content type without
        parameters, and the ``Content-Encoding`` metadata value.
    """

    def _base_type(meta_info: dict):
        # Keep only the part before the first ';' (e.g. drop a charset).
        declared = meta_info.get('Content-Type')
        return declared.split(';')[0] if declared else declared

    meta = extract_file_meta(file_bytes)
    content_type = _base_type(meta)

    if content_type == 'application/gzip':
        # Look inside the archive: inspect the decompressed payload instead.
        file_bytes = gzip.decompress(file_bytes)
        meta = extract_file_meta(file_bytes)
        content_type = _base_type(meta)

    file_ext: Optional[str] = None
    if content_type is not None:
        guessed = mimetypes.guess_extension(content_type)
        # guess_extension returns e.g. '.pdf'; drop the leading dot.
        file_ext = guessed[1:] if guessed else guessed

    return FileInspection(
        file_bytes=file_bytes,
        file_ext=file_ext,
        content_type=content_type,
        content_encoding=meta.get('Content-Encoding'),
    )


# Matches any run of one or more whitespace characters.
blank_reg = re.compile(r'\s+')
# Matches two characters in the range U+4E00-U+9FA5 (CJK ideographs) that are
# separated by whitespace, capturing each character so that a substitution
# can keep both and drop the gap between them.
word_blank_reg = re.compile(r'([\u4E00-\u9FA5])\s+([\u4E00-\u9FA5])')


def normalize_file_content(content: str) -> str:
    """Normalize whitespace in extracted text and cap its length.

    Every run of whitespace becomes a single space, spaces between two
    characters matched by ``word_blank_reg`` are removed, and the result is
    cut to at most ``FILE_CONTENT_LIMIT`` characters.

    Args:
        content: Text to normalize.

    Returns:
        The normalized, possibly truncated text.
    """
    text = blank_reg.sub(' ', content)
    # Two passes are needed: a match consumes its right-hand character, so in
    # a run like "A B C" the first pass joins "A B" but cannot also join "B C".
    for _ in range(2):
        text = word_blank_reg.sub(r'\1\2', text)

    if len(text) > FILE_CONTENT_LIMIT:
        return text[:FILE_CONTENT_LIMIT]
    return text
