"""Parse micromeritics xls output files.

@author Chris Murdock
@modified Paul Iacomi
"""

import re
from itertools import product

import dateutil.parser
import xlrd

from pygaps import logger
from pygaps.utilities.exceptions import ParsingError

_NUMBER_REGEX = re.compile(r"^(-)?\d+(.|,)?\d+")
_BRACKET_REGEX = re.compile(r"(?<=\().+?(?=\))")

_META_DICT = {
    'material': {
        'text': ('sample:', 'echantillon:'),
        'type': 'string',
        "xl_ref": (0, 1),
    },
    'adsorbate': {
        'text': ('analysis ads', ),
        'type': 'string',
        "xl_ref": (0, 1),
    },
    'temperature': {
        'text': ('analysis bath', ),
        'type': 'number',
        "xl_ref": (0, 1),
    },
    'operator': {
        'text': ('operator', 'analyste'),
        'type': 'string',
        "xl_ref": (0, 1),
    },
    'date': {
        'text': ('started', ),
        'type': 'string',
        "xl_ref": (0, 1),
    },
    'material_mass': {
        'text': ('sample mass', ),
        'type': 'number',
        "xl_ref": (0, 1),
    },
    'apparatus': {
        'text': ('micromeritics instrument', ),
        'type': 'string',
        "xl_ref": (1, 0),
    },
    'comment': {
        'text': ('comments', ),
        'type': 'string',
        "xl_ref": (0, 0),
    },
    'error': {
        'text': ('primary data', ),
        'type': 'error',
        "xl_ref": (1, 0),
    },
}

_DATA_DICT = {
    'absolute': 'pressure',
    'relative': 'pressure_relative',
    'saturation': 'pressure_saturation',
    'quantity': 'loading',
    'elapsed': 'time',
}

_UNITS_DICT = {
    "p": {
        "torr": ('mmHg', 'torr'),
        "kPa": ('kPa'),
        "bar": ('bar'),
        "mbar": ('mbar'),
    },
    "l": {
        "mmol": ("mmol"),
        "mol": ("mol"),
        "cm3(STP)": ("ml(STP)", "cm3(STP)", "cm^3(STP)", "cm³"),
    },
    "m": {
        "g": ("g", "g-1", "g STP", "kg STP", "g^-1"),
    },
}


def parse(path):
    """
    Parse an xls file generated by micromeritics software.

    The first sheet of the workbook is scanned cell by cell. Cells whose text
    starts with a label listed in ``_META_DICT`` provide metadata, and the
    cell "Isotherm Tabular Report" marks the start of the data columns.

    Parameters
    ----------
    path: str
        the location of an xls file generated by a micromeritics instrument.

    Returns
    -------
    meta : dict
        Metadata found in the report (material, adsorbate, units, ...).
    data : dict
        Lists of data points, keyed by column name (pressure, loading, ...).

    """
    meta = {}
    data = {}
    errors = []

    workbook = xlrd.open_workbook(path, encoding_override='latin-1')
    sheet = workbook.sheet_by_index(0)

    for row, col in product(range(sheet.nrows), range(sheet.ncols)):
        cell_value = str(sheet.cell(row, col).value).lower()

        if cell_value == "isotherm tabular report":
            # Start of the "data" section
            header_list = _get_data_labels(sheet, row, col)
            head, units = _parse_header(header_list)
            meta.update(units)

            # head[0] is the 'branch' placeholder, which has no column of its own
            for i, h in enumerate(head[1:]):
                points = _get_datapoints(sheet, row, col + i)

                # The first entry of the time and saturation pressure columns
                # is dropped. NOTE(review): presumably an initial measurement
                # without a matching pressure point - confirm.
                if h == 'time':
                    data[h] = _convert_time(points)[1:]
                elif h == 'pressure_saturation':
                    data[h] = points[1:]
                else:
                    data[h] = points
            continue

        # Otherwise check whether the cell is the label of a metadata field
        name = next((
            k for k, v in _META_DICT.items()
            if any(cell_value.startswith(n) for n in v.get('text', []))
        ), None)
        if name is None:
            continue

        ref = _META_DICT[name]['xl_ref']
        tp = _META_DICT[name]['type']
        val_row, val_col = row + ref[0], col + ref[1]
        if val_row >= sheet.nrows or val_col >= sheet.ncols:
            # Label on the edge of the sheet: there is no value cell to read
            continue
        val = sheet.cell(val_row, val_col).value

        if tp == 'number':
            meta[name] = _handle_numbers(val, name)
        elif tp == 'string':
            meta[name] = _handle_string(val)
        elif tp == 'error':
            errors += _get_errors(sheet, row, col)

    if errors:
        meta['errors'] = errors

    _check(meta, data, path)

    # Set extra metadata
    try:
        meta['date'] = dateutil.parser.parse(meta['date']).isoformat()
    except (KeyError, TypeError, ValueError, OverflowError):
        # Best effort only: a missing or unreadable date is left as it is.
        logger.warning("Could not convert date.")
    # Defaults only: a mode/basis derived from the data header is kept, so a
    # file with relative pressure alone is not relabelled as 'absolute'.
    meta.setdefault('pressure_mode', 'absolute')
    meta.setdefault('loading_basis', 'molar')
    meta.setdefault('material_basis', 'mass')

    return meta, data


def _handle_numbers(val, name):
    """
    Remove any extra information (such as units) to return only the number as a float.

    Input is a cell of type 'number'.
    """
    if val:
        ret = float(_NUMBER_REGEX.search(val.replace(',', '')).group())
        if name == 'temperature':
            if '°C' in val:
                ret = ret + 273.15
        return ret


def _handle_string(val):
    """
    Replace Comments: and any newline found.

    Input is a cell of type 'string'.
    """
    return val.replace('Comments: ', '').replace('\r\n', ' ')


def _convert_time(points):
    """Convert time points from HH:MM format to minutes."""
    minutes = []
    for point in points:
        hours, mins = str(point).split(':')
        minutes.append(int(hours) * 60 + int(mins))
    return minutes


def _get_data_labels(sheet, row, col):
    """
    Locate all column labels for data collected during the experiment.

    Labels are read two rows below the cell at (row, col), moving right for
    as long as the cell text starts with one of the keys of ``_DATA_DICT``.
    Cells outside the sheet count as empty, so a header row that reaches the
    last column ends the search instead of raising an IndexError.

    Returns
    -------
    list of str
        The labels found, or a default set of labels if there are none.
    """
    header_row = row + 2
    known_labels = tuple(_DATA_DICT)

    def _label_at(column):
        """Return the header text in a column ('' outside of the sheet)."""
        if header_row >= sheet.nrows or column >= sheet.ncols:
            return ''
        return str(sheet.cell(header_row, column).value)

    final_column = col
    while _label_at(final_column).lower().startswith(known_labels):
        final_column += 1

    if col == final_column:
        # this means no header exists, can happen in some older files
        # the units might not be standard! TODO should check
        logger.warning("Default data headers supplied for file.")
        return [
            "Relative Pressure (P/Po)",
            "Absolute Pressure (kPa)",
            "Quantity Adsorbed (cm³/g STP)",
            "Elapsed Time (h:min)",
            "Saturation Pressure (kPa)",
        ]

    return [_label_at(i) for i in range(col, final_column)]


def _get_datapoints(sheet, row, col):
    """Return all collected data points for a given column."""
    rowc = 3
    # Data can start on two different rows. Try first option and then next row.
    if sheet.cell(row + rowc, col).value:
        start_row = row + rowc
        final_row = row + rowc
    else:
        start_row = row + (rowc + 1)
        final_row = row + (rowc + 1)
    point = sheet.cell(final_row, col).value
    while point:
        final_row += 1
        point = sheet.cell(final_row, col).value
        # sometimes 1-row gaps are left for P0 measurement
        if not point:
            final_row += 1
            point = sheet.cell(final_row, col).value
    return [
        sheet.cell(i, col).value for i in range(start_row, final_row) if sheet.cell(i, col).value
    ]


def _parse_header(header_split):
    """
    Parse an adsorption/desorption header to get columns and units.

    Parameters
    ----------
    header_split : list of str
        Column labels, for example "Absolute Pressure (kPa)".

    Returns
    -------
    headers : list
        'branch' followed by one name per label: the ``_DATA_DICT`` name when
        the label starts with a known prefix, otherwise the label itself.
    units : dict
        Pressure, loading and material units/modes read from the labels.

    Raises
    ------
    ParsingError
        If a pressure or loading label has no readable unit.
    """
    headers = ['branch']
    units = {}

    for h in header_split:
        label = str(h)
        lowered = label.lower()
        header = next((_DATA_DICT[a] for a in _DATA_DICT if lowered.startswith(a)), h)
        headers.append(header)

        # Exact comparison: `in` would be a substring test on 'loading'
        if header == 'loading':
            parts = _header_unit(label).split('/')
            if len(parts) != 2:
                raise ParsingError(f"Could not parse loading unit in header '{label}'.")
            unit_l, unit_m = parts

            units['loading_basis'] = 'molar'
            units['loading_unit'] = _parse_unit(unit_l, 'l')

            units['material_basis'] = 'mass'
            units['material_unit'] = _parse_unit(unit_m, 'm')

        elif header == 'pressure':
            units['pressure_mode'] = 'absolute'
            units['pressure_unit'] = _parse_unit(_header_unit(label), 'p')

    # Without an absolute pressure column, the relative one is used as pressure
    if 'pressure' not in headers:
        if 'pressure_relative' in headers:
            headers[headers.index('pressure_relative')] = 'pressure'
            units['pressure_mode'] = 'relative'

    return headers, units


def _header_unit(label):
    """Return the text found between round brackets in a column label."""
    match = _BRACKET_REGEX.search(label)
    if match is None:
        raise ParsingError(f"Could not find a unit in header '{label}'.")
    return match.group().strip()


def _parse_unit(unit, unit_type):
    """
    Return the standard name of a unit as found in the file.

    Parameters
    ----------
    unit : str
        Unit text, for example "mmHg".
    unit_type : str
        Key of the ``_UNITS_DICT`` group to search in.

    Returns
    -------
    str
        The key of the first entry whose spellings contain ``unit``.

    Raises
    ------
    ParsingError
        If the unit is not known.
    """
    unit = unit.strip()
    for (k, v) in _UNITS_DICT[unit_type].items():
        # Guard against a spelling given as a bare string: `in` would then be
        # a substring search and, for instance, find 'mol' inside 'mmol'.
        aliases = (v, ) if isinstance(v, str) else v
        if unit in aliases:
            return k
    raise ParsingError(f"Could not parse unit '{unit}'.")


def _get_errors(sheet, row, col):
    """
    Look for all cells that contain errors.
    (are below a cell labelled primary data).

    Cells are read downwards, starting at the offset given by the 'error'
    entry of ``_META_DICT``, until an empty cell or the end of the sheet is
    reached.

    Returns
    -------
    list
        The values of the cells found (empty if there are none).
    """
    row_offset, col_offset = _META_DICT['error']['xl_ref']
    error_col = col + col_offset

    def _value_at(cell_row):
        """Return the value of a cell in the column ('' outside the sheet)."""
        if cell_row >= sheet.nrows or error_col >= sheet.ncols:
            return ''
        return sheet.cell(cell_row, error_col).value

    found = []
    current_row = row + row_offset
    value = _value_at(current_row)
    while value:
        found.append(value)
        current_row += 1
        value = _value_at(current_row)
    return found


def _check(meta, data, path):
    """
    Check keys in data and logs a warning if a key is empty.
    Also logs a warning for errors found in file.
    """
    if 'loading' in data:

        # Some files use an odd format
        # We instead remove unreadable values
        dels = []
        for k, v in data.items():
            if not v:
                logger.info(f'No data collected for {k} in file {path}.')

            if len(v) != len(data['pressure']):
                dels.append(k)

        for d in dels:
            del data[d]

    if 'errors' in meta:
        logger.warning('Report file contains warnings:')
        logger.warning('\n'.join(meta['errors']))
