#!/usr/bin/env python
"""
rollbar-agent: agent to monitor log files and send notices to Rollbar
"""
import ast
import codecs
import copy
import fnmatch
import hashlib
import json
import logging
import optparse  # instead of argparse, for python2.6 compat
import os
import re
import shelve
import signal
import socket
import stat
import sys
import threading
import time

try:
    import configparser
except ImportError:
    import ConfigParser as configparser

try:
    import dbm.ndbm as dbm
except ImportError:
    import dbm

import requests

log = logging.getLogger(__name__)

VERSION = '0.5.1'

DEFAULT_ENDPOINT = 'https://api.rollbar.com/api/1/item/'
DEFAULT_TIMEOUT = 3  # in seconds
DEFAULT_DATEFMT = '%Y-%m-%d %H:%M:%S,%f'

LOG_LEVEL = {
    'notset': 0,
    'notse': 0,
    'debug': 10,
    'info': 20,
    'warning': 30,
    'warni': 30,
    'warn': 30,
    'error': 40,
    'critical': 50,
    'criti': 50,
    'crit': 50
}


## utils

terminal_character_attribute_pattern = re.compile(r'\x1b\[[0-9;]*m')


def iteritems(_dict):
    return _dict.items() if not hasattr(_dict, 'iteritems') else _dict.iteritems()


def itervalues(_dict):
    return _dict.values() if not hasattr(_dict, 'itervalues') else _dict.itervalues()


def clean_line(app_config, line):
    if app_config.get('filter_chr_attr_sequences'):
        return terminal_character_attribute_pattern.sub("", line)
    return line


def parse_timestamp(format, s):
    try:
        ts = time.mktime(time.strptime(s, format))
    except ValueError:
        # fall back to current timestamp
        ts = time.time()
    return int(ts)


def find_filenames(app_config):
    """
    Returns a list of all the filenames to process from the specified app
    """
    filenames = []
    for target in app_config['targets']:
        if os.path.isfile(target):
            if should_process_file(app_config, target):
                filenames.append(target)
        elif os.path.isdir(target):
            for relative_name in os.listdir(target):
                filename = os.path.join(target, relative_name)
                if os.path.isfile(filename) and should_process_file(app_config, filename):
                    filenames.append(filename)

    return filenames


def should_process_file(app_config, filename):
    """
    Returns True if we should process the specified filename for the specified app
    """
    if not os.path.isfile(filename):
        return False

    if filename in app_config['blacklist']:
        return False

    if filename in app_config['targets']:
        return True

    ext = filename.split('.')[-1]
    if app_config['ext_whitelist']:
        return ext in app_config['ext_whitelist']

    if ext in app_config['ext_blacklist']:
        return False

    return True


def build_python_log_format_parser(format, datefmt):
    """
    Parses a python log format string, and returns a (regex, strptimeformat) to parse it.
    """
    if not datefmt:
        datefmt = DEFAULT_DATEFMT

    python_name = r'[a-zA-Z_][a-zA-Z0-9_]*'

    # dict of key: (match_name, regex_string)
    known_keys = {
        '%(asctime)s': ('timestamp', datefmt_to_regex(datefmt)),
        '%(created)f': ('created', r'\d+\.\d+'),
        '%(filename)s': ('filename', r'\S+'),
        '%(funcName)s': ('function_name', python_name),
        '%(levelname)s': ('level', r'[a-zA-Z_]+'),
        '%(levelname)-5.5s': ('level', r'[a-zA-Z_][a-zA-Z_ ]{4}'),
        '%(levelname)-8s': ('level', r'[a-zA-Z_]([a-zA-Z_ ]{7}|[a-zA-Z_]{8,})'),
        '%(levelno)d': ('levelno', r'\d+'),
        '%(lineno)d': ('lineno', r'\d+'),
        '%(lineno)s': ('lineno', r'\d+'),
        '%(module)s': ('module_name', python_name),
        '%(message)s': ('title', '.*'),
        '%(name)s': ('name', '\S+'),
        '%(pathname)s': ('pathname', '\S+'),
        '%(process)d': ('pid', '\d+'),
        '%(processName)s': ('process_name', '\S+'),
        '%(relativeCreated)d': ('relative_created', '\d+'),
        '%(thread)d': ('thread', '\d+'),
        '%(threadName)s': ('thread_name', '\S+'),
        '%(threadName)-20s': ('thread_name', '\S+ {,19}'),
    }

    regex_str = '^' + re.escape(format) + '$'
    for key, val in iteritems(known_keys):
        search = re.escape(key)
        replacement = "(?P<%s>%s)" % val
        regex_str = regex_str.replace(search, replacement)

    log.debug("Built regex string from format %s datefmt %s => %s", format, datefmt, regex_str)

    return (re.compile(regex_str), datefmt)


def datefmt_to_regex(datefmt):
    # the below should work for en-us locale... probably won't work for all locales.
    replacements = {
        '%a': r'[a-zA-Z]+',  # locale's abbreviated weekday name
        '%A': r'[a-zA-Z]+',  # locale's full weekday name
        '%b': r'[a-zA-Z]+',  # locale's abbreviated month name
        '%B': r'[a-zA-Z]+',  # locale's full month name
        '%c': r'[a-zA-Z]+ [a-zA-Z]+ \d{1,2} \d{2}:\d{2}:\d{2} \d{4}',  # locale's appropriate date and time representation.
        '%d': r'\d{2}',  # day of the month [01,31]
        '%f': r'\d+',  # milliseconds
        '%H': r'\d{2}',  # hour (24-hour clock) [00,23]
        '%I': r'\d{2}',  # hour (12-hour clock) [01,12]
        '%j': r'\d{3}',  # day of the year as a decimal number [001,366]
        '%m': r'\d{2}',  # month [01,12]
        '%M': r'\d{2}',  # minute [00,59]
        '%p': r'[A-Z]{2}',  # locale's equivalent of either AM or PM
        '%S': r'\d{2}',  # second [00,61]
        '%U': r'\d{2}',  # week number of the year starting on Sundays [00,53]
        '%w': r'\d',  # weekday number [0,6]
        '%W': r'\d{2}', # week number of the year starting on Mondays [00,53]
        '%x': r'\d{2}/\d{2}/\d{2}',  # locale's date representation
        '%X': r'\d{2}:\d{2}:\d{2}',  # locale's time representation
        '%y': r'\d{2}',  # year without century [00,99]
        '%Y': r'\d{4}',  # year with century
        '%Z': r'[A-Z]{3}',  # time zone name
    }

    for key, val in iteritems(replacements):
        datefmt = datefmt.replace(key, val)

    # last: replace %% with %
    datefmt = datefmt.replace('%%', '%')
    return datefmt


## processors

def choose_processor(filename):
    """
    Returns the Processor subclass that should be used to process the specified file
    """
    if filename.endswith('.rollbar') or filename.endswith('.ratchet'):
        return RollbarFileProcessor
    return LogFileProcessor


class Processor(object):
    """
    Base Processor class
    """
    def __init__(self, scanner, app):
        self.scanner = scanner
        self.app = app

    def process(self, fp):
        raise NotImplementedError()

    def send_payload(self, payload):
        # do immediate http post
        # in the future, will do this with batches and single separate thread

        if sys.version_info[0] >= 3:
            payload_is_str = isinstance(payload, str)
        else:
            payload_is_str = isinstance(payload, basestring)
        if payload_is_str:
            payload = payload.encode('utf8')

        if options.dry_run:
            log.debug("Dry run; payload to send: %s", payload)
        else:
            log.debug("Sending payload: %s", payload)
            config = self.app['config']
            resp = requests.post(
                    config['endpoint'],
                    data=payload,
                    timeout=config['timeout'],
                    headers={"content-type": "application/json"})
            if resp.status_code != 200:
                log.warning("Unexpected response from Rollbar API. Code: %s Body: %s",
                    resp.status_code, resp.text)


class RollbarFileProcessor(Processor):
    """
    Processor for .rollbar files
    Each line is a json-encoded payload, so all we have to do is decode it and send it.
    """
    def process(self, fp, filename, state):
        for line in fp:
            log.debug("Read line. Length: %d Hash: %s", len(line), hashlib.md5(line.encode('utf8')).hexdigest())
            self._process_line(line)

    def _process_line(self, line):
        line = line.strip()
        if not line:
            log.debug("Skipping empty line")
            return
        try:
            payload = json.loads(line)

            # use the agent's access token if one isn't defined for the loaded item
            if not payload.get('access_token'):
                payload['access_token'] = self.app['config']['params.access_token']
                line = json.dumps(payload)

        except ValueError:
            log.warning("Could not process badly formatted line: %s", line)
            return

        self.send_payload(line)


class LogFileProcessor(Processor):
    """
    Processor for general log files - currently works reasonably well for paste/pylons log files.
    Some events we want will span multiple lines.
    """
    # dictionary of (pattern, strptime format and name)
    _default_message_start_parser = {
        'regex': re.compile(r'(?P<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}),(\d+)?\s+(?P<level>[A-Z]+)\s+(?P<title>.*$)'),
        'datefmt': '%Y-%m-%d %H:%M:%S',
        'name': 'default parser'
    }

    def __init__(self, *args, **kw):
        super(LogFileProcessor, self).__init__(*args, **kw)
        self._init_formatters()

    def _init_formatters(self):
        default_pattern_name = self.app['config']['log_format.default']
        self.default_parser = self.scanner.config['_formats'].get(default_pattern_name,
            self._default_message_start_parser)

        # ordered list of tuples, in descending order of priority.
        self.pattern_parsers = []

        patterns_config = self.app['config']['log_format.patterns'].split('\n')
        for pair in patterns_config:
            pair = pair.strip()
            if not pair:
                continue
            # Using None as the separator means
            # "runs of consecutive whitespace are regarded as a single separator"
            pattern, format = pair.split(None, 1)
            if format in self.scanner.config['_formats']:
                pattern = re.compile(fnmatch.translate(pattern))
                self.pattern_parsers.append((pattern, self.scanner.config['_formats'][format]))

    def process(self, fp, filename, state):
        empty_message = {'data': [], 'timestamp': None, 'level': None, 'title': None}
        current_message = state.get('current_message', copy.deepcopy(empty_message))

        # if this filename matches any of our parsers, use the appropriate parser.
        # otherwise, use the default.
        parser = self.default_parser
        for pattern, _parser in self.pattern_parsers:
            if pattern.search(filename):
                parser = _parser
                break
        log.debug("File %s using parser %s", filename, parser['name'])

        scrub_regexes = []

        patterns_config = self.app['config']['scrub_regex_patterns'].split('\n')
        for pattern in patterns_config:
            pattern = pattern.strip()
            if not pattern:
                continue

            try:
                scrub_regexes.append(re.compile(pattern))
            except Exception as e:
                log.warning("Could not compile regex pattern: %s" % pattern)

        for line in fp:
            # does this look like the beginning of a new log message?
            line = clean_line(self.app['config'], line)
            match = parser['regex'].match(line)
            if match:
                if current_message['data']:
                    # done with the previous item - send it off and clear data
                    self._process_message(current_message, filename)
                    current_message = copy.deepcopy(empty_message)

                # save interesting data from first line
                current_message['timestamp'] = parse_timestamp(parser['datefmt'],
                    match.group('timestamp').strip())
                current_message['level'] = match.group('level').strip()
                current_message['title'] = match.group('title').strip()
                if 'thread_name' in match.groupdict():
                    current_message['thread_name'] = match.group('thread_name').strip()

            for regex in scrub_regexes:
                try:
                    line = regex.sub('******', line)
                except Exception as e:
                    log.warning("Could not use regex %s on line %s" % (regex, line))

            current_message['data'].append(line)

        if self.scanner.scan_start_time - state['mtime'] > 1:
            # it's been at least 1 second since anything was written to the file
            # if there's a pending message, send it
            if current_message['data']:
                self._process_message(current_message, filename)
                current_message = copy.deepcopy(empty_message)

        state['current_message'] = current_message

    def _process_message(self, message, filename):
        # sanitize level
        message['level'] = (message.get('level') or '').lower()

        # if the level is below our threshold, ignore it
        level = LOG_LEVEL.get(message['level'], 0)
        min_level = self.app['config']['min_log_level']
        if level < min_level:
            log.debug("Skipping message; level: %s min_level: %d", message['level'], min_level)
            return

        payload = self._build_payload(message, filename)
        self.send_payload(payload)

    def _build_payload(self, message, filename):
        message_data = "".join(message['data'])
        log.debug("Building message payload. Timestamp: %s Level: %s Title: %s Data: %s",
            message['timestamp'], message['level'], message['title'], message_data)

        app_config = self.app['config']

        # basic params
        data = {
            'timestamp': message['timestamp'],
            'environment': app_config['params.environment'],
            'level': message['level'],
            'notifier': {
                'name': 'rollbar-agent',
                'rollbar_agent_app': self.app['name'],
                'version': VERSION
            }
        }
        if app_config.get('params.language'):
            data['language'] = app_config['params.language']
        if app_config.get('params.framework'):
            data['framework'] = app_config['params.framework']

        # message body
        data['body'] = {
            'message': {
                'title': message['title'],
                'body': message_data
            }
        }

        # server environment
        data['server'] = {
            'log_file': filename,
            'host': self.app['host'],
            'branch': app_config['params.branch'],
            'root': app_config['params.root'],
        }

        payload = {
            'access_token': self.app['config']['params.access_token'],
            'data': data
        }
        return json.dumps(payload)


## main thread and loop

class ScannerThread(threading.Thread):
    """
    The main 'scanner' thread - scans files and posts items to the rollbar api.
    There should only be a single instance of this thread.
    """
    def __init__(self, stop_event, config):
        super(ScannerThread, self).__init__()
        self.stop_event = stop_event
        self.config = config

        self.apps = {}
        for app_name, app_config in iteritems(config):
            if app_name.startswith('_'):
                continue
            self.apps[app_name] = {
                'name': app_name,
                'config': app_config,
                'host': socket.gethostname(),
            }

    def run(self):
        sleep_seconds = self.config['_global']['sleep_time']
        while not self.stop_event.is_set():
            log.info("scanner thread looping...")
            start_time = time.time()

            try:
                self.scan_all()
            except:
                log.exception("Caught exception in ScannerThread.run() loop")

            if options.dry_run or options.skip_to_end:
                break

            # sleep for at most sleep_seconds seconds.
            wait_time = (start_time + sleep_seconds) - time.time()
            if wait_time > 0:
                self.stop_event.wait(wait_time)

    def scan_all(self):
        # we keep state in a dictionary like:
        # {'files': {'filename1': {'pos': 12345, 'inode': 4567}, ...}}
        self.scan_start_time = time.time()
        state = self.load_state()
        apps_state = state.get('apps', {})

        # Close the state file before recreating it
        self.save_state(state)

        try:
            for app in itervalues(self.apps):
                self.scan_app(app, apps_state)
        except:
            log.exception("Caught exception in ScannerThread.scan_all() loop")
        finally:
            # recreate the state file and save the new apps_state to trim
            # any excess space being used by the old state file
            state = self.load_state(recreate=True)
            state['apps'] = apps_state
            self.save_state(state)

    def load_state(self, recreate=False):
        if not options.dry_run:
            try:
                # the 'n' flag will wipe away the old file if it exsits, otherwise
                # the 'c' flag will only create the file if it doesn't exist
                # https://docs.python.org/2/library/anydbm.html#anydbm.open
                return shelve.open(self.config['_global']['statefile'],
                                   flag='n' if recreate else 'c',
                                   protocol=self.config['_global']['state_proto'])
            except dbm.error:
                log.error("Could not open statefile for writing. "
                    "Perhaps the directory doesn't exist? "
                    "You can configure this by modifying the 'statefile' in rollbar-agent.conf. "
                    "Current value is %s" % self.config['_global']['statefile'])
                raise
        else:
            return {}

    def save_state(self, state):
        if not options.dry_run:
            state.close()

    def scan_app(self, app, apps_state):
        log.debug("Scanning app %s", app['name'])
        app_state = apps_state.setdefault(app['name'], {})
        files_state = app_state.setdefault('files', {})

        filenames = find_filenames(app['config'])

        removed_filenames = set(files_state.keys()) - set(filenames)
        for filename in removed_filenames:
            files_state.pop(filename)

        for filename in filenames:
            try:
                stats = os.stat(filename)
            except OSError:
                log.debug("File %s no longer exists, continuing...", filename)
                continue

            log.debug("Processing file %s", filename)

            inode = stats[stat.ST_INO]
            mtime = stats[stat.ST_MTIME]
            if filename in files_state:
                # filename we've seen before.
                if inode != files_state[filename]['inode']:
                    # file has been rotated. reset to position 0 and store new inode.
                    log.debug("File %s appears to have been rotated.", filename)
                    files_state[filename] = {'pos': 0, 'inode': inode, 'mtime': mtime}
            else:
                # new file - initialize
                log.debug("Initializing new file %s", filename)
                files_state[filename] = {'pos': 0, 'inode': inode, 'mtime': mtime}

            self.scan_file(app, filename, files_state[filename])

        log.debug("Completed scan of app %s", app['name'])

    def scan_file(self, app, filename, file_state):
        log.debug("Scanning file %s", filename)
        processor = choose_processor(filename)(self, app)
        with codecs.open(filename, 'r', encoding='utf-8', errors='replace') as fp:
            pos = file_state['pos']
            log.debug("File %s seeking to pos %d", filename, pos)
            fp.seek(pos)
            if options.skip_to_end:
                fp.seek(0, os.SEEK_END)
            else:
                processor.process(fp, filename, file_state)
            new_pos = fp.tell()
            log.debug("File %s new pos %d", filename, new_pos)
            file_state['pos'] = new_pos

        if app['config']['delete_processed_files'] and new_pos == os.stat(filename).st_size:
            # If we've finished reading the whole file and our config was set up to delete logs
            # after processing them, remove the file here
            os.remove(filename)


def register_signal_handlers(stop_event):
    def signal_handler(signum, frame):
        log.info("Shutting down...")
        stop_event.set()

    for sig in (signal.SIGINT, signal.SIGTERM, signal.SIGQUIT, signal.SIGALRM):
        signal.signal(sig, signal_handler)


def main_loop():
    stop_event = threading.Event()
    register_signal_handlers(stop_event)
    scanner = ScannerThread(stop_event, config)
    scanner.start()

    # sleep until the thread is killed
    # have to sleep in a loop, instead of worker.join(), otherwise we'll never get the signals
    while scanner.is_alive():
        time.sleep(1)

    log.info("Shutdown complete")


## configuration

def build_option_parser():
    parser = optparse.OptionParser()

    parser.add_option('-c', '--config', dest='config_file', action='store',
        default='rollbar-agent.conf', help='Path to configuration file. Default: rollbar-agent.conf in the working directory.')
    parser.add_option('--dry-run', dest='dry_run', action='store_true', default=False,
        help='Dry run: processes log files, but does not save state or submit events to Rollbar. Exits after processing once.')
    parser.add_option('--skip_to_end', dest='skip_to_end', action='store_true', default=False,
        help='Go through existing log files and save them in the state without processing them, so you do not process existing log info next run. Exits after processing once.')

    # verbosity
    verbosity = optparse.OptionGroup(parser, 'Verbosity')
    verbosity.add_option('-v', '--verbose', dest='verbose', action='store_true', default=False,
        help='Verbose output (uses log level DEBUG)')
    verbosity.add_option('-q', '--quiet', dest='quiet', action='store_true', default=False,
        help='Quiet output (uses log level WARNING)')

    parser.add_option_group(verbosity)

    return parser


def parse_config(filename):
    # if the filename containing rollbar agent configuration does not exist, exit
    if not os.path.isfile(filename):
        logging.critical('The config file %s was not found.', filename)
        sys.exit(1)

    defaults = {
        'statefile': '/var/cache/rollbar-agent.state',
        'state_proto': '0',
        'sleep_time': '10',
        'endpoint': DEFAULT_ENDPOINT,
        'timeout': str(DEFAULT_TIMEOUT),
        'root': os.getcwd(),
        'ext_whitelist': 'log, rollbar',
        'ext_blacklist': '',
        'targets': '',
        'log_levels': str(LOG_LEVEL),
        'blacklist': '',
        'log_format.default': '',
        'log_format.patterns': '',
        'scrub_regex_patterns': '',
        'delete_processed_files': 'false',
        'filter_chr_attr_sequences': 'false',
    }

    def to_int(val):
        return int(val)
    def to_list(val):
        return re.split(r'\s+', val)
    def to_dict(val):
        log_level_dict = ast.literal_eval(val)
        return log_level_dict
    def to_log_level(val):
        return LOG_LEVEL[val.lower()]
    def to_bool(val):
        return val.lower() == 'true'

    parsers = {
        'state_proto': to_int,
        'sleep_time': to_int,
        'timeout': to_int,
        'ext_whitelist': to_list,
        'ext_blacklist': to_list,
        'targets': to_list,
        'blacklist': to_list,
        'log_levels': to_dict,
        'max_file_page': to_int,
        'min_log_level': to_log_level,
        'delete_processed_files': to_bool,
        'filter_chr_attr_sequences': to_bool,
    }

    if sys.version_info >= (3, 2):
        cp = configparser.ConfigParser(defaults)
    else:
        cp = configparser.SafeConfigParser(defaults)
    cp.read([filename])

    config = {'_formats': {}}
    for section_name in cp.sections():
        if section_name.startswith('app:'):
            app_name = section_name[len('app:'):]
            app = {'name': app_name}
            for option_name, raw_value in cp.items(section_name):
                if option_name in parsers:
                    value = parsers[option_name](raw_value)
                else:
                    value = raw_value
                app[option_name] = value

            config[app_name] = app
        elif section_name.startswith('format:'):
            format_name = section_name[len('format:'):]
            format = {'name': format_name}

            format_type = cp.get(section_name, 'type')
            format_spec = cp.get(section_name, 'format', raw=True)
            try:
                format_datefmt = cp.get(section_name, 'datefmt', raw=True)
            except configparser.NoOptionError:
                format_datefmt = DEFAULT_DATEFMT

            if format_type != 'python':
                log.warning("Unrecognized format type: %s", format_type)
                continue
            regex, datefmt = build_python_log_format_parser(format_spec, format_datefmt)
            format['regex'] = regex
            format['datefmt'] = datefmt

            config['_formats'][format_name] = format

    global_config = cp.defaults()
    config['_global'] = {}
    for option_name, raw_value in iteritems(global_config):
        if option_name in parsers:
            value = parsers[option_name](raw_value)
        else:
            value = raw_value
        config['_global'][option_name] = value

    return config


def validate_config(config):
    errors = []
    required_vars = ['params.access_token', 'targets']
    for app_name, app_config in iteritems(config):
        if app_name.startswith('_'):
            continue
        for var_name in required_vars:
            if not app_config.get('params.access_token'):
                errors.append("app:%s is missing required var %s" % (app_name, var_name))
    if errors:
        print("CONFIGURATION ERRORS")
        for error in errors:
            print(error)
        print()
        sys.exit(1)


if __name__ == '__main__':
    # first parse command-line options to get the path to the config file
    parser = build_option_parser()
    (options, args) = parser.parse_args()

    # now parse the config file
    config = parse_config(options.config_file)

    # validate - will exit if invalid
    validate_config(config)

    # set up logging
    level = logging.INFO
    if options.verbose:
        level = logging.DEBUG
    elif options.quiet:
        level = logging.WARNING

    formatter = logging.Formatter("%(asctime)s %(levelname)-5.5s %(message)s")
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    handler.setLevel(level)
    log.addHandler(handler)
    log.setLevel(level)

    # start main loop
    main_loop()
