# SPDX-FileCopyrightText: 2018 Sebastian Wagner
#
# SPDX-License-Identifier: AGPL-3.0-or-later

# -*- coding: utf-8 -*-
"""
Parses CTIP data in JSON format.

The key ``indicatorexpirationdatetime`` is ignored because its meaning is unknown.

There are two different variants of the data:

* Interflow format: a JSON list, keys are translated using the ``INTERFLOW`` mapping
* Azure format: JSON stream format, a short example structure:

    .. code-block:: json

       {
         "DataFeed": "CTIP-Infected",
         "SourcedFrom": "SinkHoleMessage|SensorMessage",
         "DateTimeReceivedUtc": nt time
         "DateTimeReceivedUtcTxt": human readable
         "Malware":
         "ThreatCode": "B67-SS-TINBA",
         "ThreatConfidence": "High|Medium|Low|Informational", -> 100/50/20/10
         "TotalEncounters": 3,
         "TLP": "Amber",
         "SourceIp":
         "SourcePort":
         "DestinationIp":
         "DestinationPort":
         "TargetIp": Deprecated, so it is ignored
         "TargetPort": Deprecated, so it is ignored
         "SourceIpInfo": {
           "SourceIpAsnNumber":
           "SourceIpAsnOrgName":
           "SourceIpCountryCode":
           "SourceIpRegion":
           "SourceIpCity"
           "SourceIpPostalCode"
           "SourceIpLatitude"
           "SourceIpLongitude"
           "SourceIpMetroCode"
           "SourceIpAreaCode"
           "SourceIpConnectionType"
         },
         "HttpInfo": {
           "HttpHost": "",
           "HttpRequest": "",
           "HttpMethod": "",
           "HttpReferrer": "",
           "HttpUserAgent": "",
           "HttpVersion": ""
         },
         "CustomInfo": {
           "CustomField1": "",
           "CustomField2": "",
           "CustomField3": "",
           "CustomField4": "",
           "CustomField5": ""
         },
         "Payload": base64 encoded json with meaningful dictionary keys or JSON-string with numbered dictionary keys
       }

"""
import json

import intelmq.lib.utils as utils
from intelmq.lib.bot import ParserBot
from intelmq.lib.harmonization import DateTime, FQDN

# Translation table for the Interflow variant: maps the keys of an Interflow
# record to the event field the value is written to.
INTERFLOW = {
    "additionalmetadata": "extra.additionalmetadata",
    "description": "event_description.text",
    "externalid": "malware.name",
    "tlplevel": "tlp",
    "firstreporteddatetime": "time.source",
    "networksourceipv4": "source.ip",
    "networksourceport": "source.port",
    "networkdestinationipv4": "destination.ip",
    "networkdestinationport": "destination.port",
    "isproductlicensed": "extra.isproductlicensed",
    "ispartnershareable": "extra.ispartnershareable",
    "networksourceasn": "source.asn",
    "hostname": "destination.fqdn",
    "useragent": "extra.user_agent",
    "severity": "extra.severity",
    "tags": "extra.tags",
}
# Translation table for the Azure variant: maps the keys of an Azure record
# to the event field the value is written to.
# Nested dictionaries are flattened to "Parent.Child" keys by
# MicrosoftCTIPParserBot.parse_azure before the lookup happens.
# The special value "__IGNORE__" marks keys which are known but not written
# to the event.
AZURE = {
    # Top-level fields
    "DataFeed": "feed.name",
    "SourcedFrom": "event_description.text",
    "DateTimeReceivedUtc": "time.source",
    "DateTimeReceivedUtcTxt": "__IGNORE__",
    "Malware": "extra.malware",
    "ThreatCode": "malware.name",
    "ThreatConfidence": "feed.accuracy",
    "TotalEncounters": "extra.total_encounters",
    "TLP": "tlp",
    "SourceIp": "source.ip",
    "SourcePort": "source.port",
    "DestinationIp": "destination.ip",
    # DestinationIpInfo.* fields are used in the ctip-c2 feed
    "DestinationIpInfo.DestinationIpAsnNumber": "destination.asn",
    "DestinationIpInfo.DestinationIpAsnOrgName": "destination.as_name",
    "DestinationIpInfo.DestinationIpCountryCode": "destination.geolocation.cc",
    "DestinationIpInfo.DestinationIpRegion": "destination.geolocation.region",
    "DestinationIpInfo.DestinationIpCity": "destination.geolocation.city",
    "DestinationIpInfo.DestinationIpPostalCode": "extra.destination.geolocation.postal_code",
    "DestinationIpInfo.DestinationIpLatitude": "destination.geolocation.latitude",
    "DestinationIpInfo.DestinationIpLongitude": "destination.geolocation.longitude",
    "DestinationIpInfo.DestinationIpMetroCode": "extra.destination.geolocation.metro_code",
    "DestinationIpInfo.DestinationIpAreaCode": "extra.destination.geolocation.area_code",
    "DestinationIpInfo.DestinationIpConnectionType": "extra.destination.connection_type",
    "DestinationIpInfo.DestinationIpv4Int": "__IGNORE__",
    "DestinationPort": "destination.port",
    "TargetIp": "__IGNORE__",
    "TargetPort": "__IGNORE__",
    "Signatures.Sha256": "extra.signatures.sha256",
    # SourceIpInfo.* sub-dictionary
    "SourceIpInfo.SourceIpAsnNumber": "source.asn",
    "SourceIpInfo.SourceIpAsnOrgName": "source.as_name",
    "SourceIpInfo.SourceIpCountryCode": "source.geolocation.cc",
    "SourceIpInfo.SourceIpRegion": "source.geolocation.region",
    "SourceIpInfo.SourceIpCity": "source.geolocation.city",
    "SourceIpInfo.SourceIpPostalCode": "extra.source.geolocation.postal_code",
    "SourceIpInfo.SourceIpLatitude": "source.geolocation.latitude",
    "SourceIpInfo.SourceIpLongitude": "source.geolocation.longitude",
    "SourceIpInfo.SourceIpMetroCode": "extra.source.geolocation.metro_code",
    "SourceIpInfo.SourceIpAreaCode": "extra.source.geolocation.area_code",
    "SourceIpInfo.SourceIpConnectionType": "extra.source.connection_type",
    "SourceIpInfo.SourceIpv4Int": "__IGNORE__",  # Duplicate of SourceIP
    # HttpInfo.* sub-dictionary
    "HttpInfo.HttpHost": "extra.http.host",
    "HttpInfo.HttpRequest": "extra.http.request",
    "HttpInfo.HttpMethod": "extra.http.method",
    "HttpInfo.HttpReferrer": "extra.http.referrer",
    "HttpInfo.HttpUserAgent": "extra.user_agent",
    "HttpInfo.HttpVersion": "extra.http.version",
    # CustomInfo.* sub-dictionary
    "CustomInfo.CustomField1": "extra.custom_field1",
    "CustomInfo.CustomField2": "extra.custom_field2",
    "CustomInfo.CustomField3": "extra.custom_field3",
    "CustomInfo.CustomField4": "extra.custom_field4",
    "CustomInfo.CustomField5": "extra.custom_field5",
    # Payload.* keys come from a payload which decoded to a dictionary.
    # Several spellings of the same information exist, so some target fields
    # appear more than once (e.g. Payload.ts / Payload.timestamp,
    # Payload.serverPort / Payload.ServerPort).
    "Payload.ts": "extra.payload.timestamp",
    "Payload.ip": "extra.payload.ip",
    "Payload.port": "extra.payload.port",
    "Payload.serverIp": "extra.payload.server.ip",
    "Payload.serverPort": "extra.payload.server.port",
    "Payload.domain": "destination.fqdn",
    "Payload.family": "extra.payload.family",
    "Payload.malware": "extra.payload.malware",
    "Payload.response": "extra.payload.response",
    "Payload.handler": "extra.payload.handler",
    "Payload.type": "protocol.application",
    # A payload which was not unpacked into Payload.* keys is kept as a whole,
    # e.g. base64-decoded text which is not valid JSON (see parse_azure)
    "Payload": "extra.payload.text",
    "Payload.Time": "extra.payload.time",
    "Payload.SourceIP": "extra.payload.source.ip",
    "Payload.DestIP": "extra.payload.destination.ip",
    "Payload.RemotePort": "extra.payload.remote.port",
    "Payload.RemoteHost": "extra.payload.remote.host",
    "Payload.ServerPort": "extra.payload.server.port",
    "Payload.BCode": "extra.payload.b_code",
    "Payload.Protocol": "extra.payload.protocol",
    "Payload.Length": "extra.payload.length",
    "Payload.URI": "destination.urlpath",
    "Payload.Referer": "extra.http_referer",
    "Payload.UserAgent": "extra.user_agent",
    "Payload.RequestMethod": "extra.http.method",
    "Payload.HTTPHost": "extra.http.host",
    "Payload.http_host": "extra.payload.http_host",
    "Payload.Custom1": "extra.payload.custom_field1",
    "Payload.Custom2": "extra.payload.custom_field2",
    "Payload.Custom3": "extra.payload.custom_field3",
    "Payload.Custom4": "extra.payload.custom_field4",
    "Payload.Custom5": "extra.payload.custom_field5",
    "Payload.timestamp": "extra.payload.timestamp",
    "Payload.timestamp_utc": "extra.payload.timestamp_utc",
    "Payload.source_ip": "extra.payload.source.ip",
    "Payload.source_port": "extra.payload.source.port",
    "Payload.src_port": "extra.payload.source.port",
    "Payload.destination_ip": "extra.payload.destination.ip",
    "Payload.dst_ip": "extra.payload.destination.ip",
    "Payload.destination_port": "extra.payload.destination.port",
    "Payload.dst_port": "extra.payload.destination.port",
    "Payload.computer_name": "extra.payload.computer_name",
    "Payload.bot_id": "extra.payload.bot_id",
    "Payload.asn": "extra.payload.source.asn",
    "Payload.dst_asn": "extra.payload.destination.asn",
    "Payload.geo": "extra.payload.source.geolocation.cc",
    "Payload.dst_geo": "extra.payload.destination.geolocation.cc",
    "Payload.url": "extra.request_raw",
    "Payload.http_agent": "extra.http_agent",
    "Payload.p0f_genre": "extra.os.name",
    "Payload.p0f_detail": "extra.os.version",
    "Payload.http_post": "extra.payload.http_post",
    "Payload.naics": "extra.naics",
    "Payload.sector": "extra.sector",
    "Payload.ssl_cipher": "extra.ssl_cipher",
}
# Numeric accuracy (in percent) for each textual "ThreatConfidence" level
# of the Azure variant.
CONFIDENCE = dict(
    High=100,
    Medium=50,
    Low=20,
    Informational=10,
)


class MicrosoftCTIPParserBot(ParserBot):
    """Parse JSON data from Microsoft's CTIP program.

    Two variants of the data are handled. The variant is detected from the
    first character of the decoded report: a JSON list is treated as
    Interflow data, a JSON dictionary as Azure data (JSON stream).
    """
    overwrite: bool = True  # overwrite existing fields

    def parse(self, report):
        """Detect the data variant and yield the report's elements.

        Sets ``self.recover_line`` to the recovery method matching the
        detected variant and delegates to ``self.parse_json`` (Interflow)
        or ``self.parse_json_stream`` (Azure).

        Raises:
            ValueError: If the decoded report starts with neither ``[``
                nor ``{``.
        """
        raw_report = utils.base64_decode(report.get("raw"))
        if raw_report.startswith('['):
            # Interflow
            self.recover_line = self.recover_line_json
            yield from self.parse_json(report)
        elif raw_report.startswith('{'):
            # Azure
            self.recover_line = self.recover_line_json_stream
            yield from self.parse_json_stream(report)
        else:
            raise ValueError("Can't parse the received message. It is neither a JSON list nor a JSON dictionary. Please report this bug.")

    def parse_line(self, line, report):
        """Dispatch one element to the parser of its variant.

        Elements with ``version`` equal to 1.5 are parsed as Interflow
        data, everything else as Azure data.
        """
        if line.get('version', None) == 1.5:
            yield from self.parse_interflow(line, report)
        else:
            yield from self.parse_azure(line, report)

    def parse_interflow(self, line: dict, report):
        """Yield one event for an Interflow record.

        Keys are translated with the ``INTERFLOW`` mapping. The record's
        ``confidence`` scales ``feed.accuracy``.

        Raises:
            ValueError: If ``indicatorthreattype`` is not ``Botnet``.
            KeyError: If the record contains a key which is neither skipped
                nor part of ``INTERFLOW``.
        """
        raw = self.recover_line(line)
        if line['indicatorthreattype'] != 'Botnet':
            raise ValueError('Unknown indicatorthreattype %r, only Botnet is supported.' % line['indicatorthreattype'])
        if 'additionalmetadata' in line and line['additionalmetadata'] in [[], [''], ['null'], [None]]:
            # drop empty metadata instead of saving a meaningless value
            del line['additionalmetadata']
        event = self.new_event(report)
        for key, value in line.items():
            if key in ['version', 'indicatorthreattype', 'confidence', 'indicatorexpirationdatetime']:
                # handled separately or deliberately ignored
                continue
            if key == "firstreporteddatetime":
                value += ' UTC'
            if key == "hostname":
                # ignore IP in FQDN field; .get() because the destination
                # address is not necessarily part of the record
                if value == line.get("networkdestinationipv4"):
                    continue
                # can contain very weird characters; validate against the
                # field the value is actually written to
                if not event.is_valid(INTERFLOW[key], value):
                    continue
            if key == 'networkdestinationipv4':
                if value == '0.0.0.0':
                    continue
                # data contains:
                # "networkdestinationipv4": "192.88.99.209, 192.88.99.209",
                # since 2019-03-14, reported upstream, IP addresses are always the same.
                # Only the first address is used.
                value = value.partition(',')[0]
            event[INTERFLOW[key]] = value
        event.add('feed.accuracy',
                  event.get('feed.accuracy', 100) * line['confidence'] / 100,
                  overwrite=True)
        event.add('classification.type', 'infected-system')
        event.add('raw', raw)
        yield event

    def parse_azure(self, line, report):
        """Yield one event for an Azure record.

        In a first pass, nested dictionaries (including a payload which
        decodes to a dictionary) are flattened in place to
        ``Parent.Child`` keys. In a second pass, the values are converted
        where necessary and written to the event using the ``AZURE``
        mapping.

        Raises:
            KeyError: If a (flattened) key is not part of ``AZURE`` or the
                ``ThreatConfidence`` is not part of ``CONFIDENCE``.
        """
        raw = self.recover_line()

        event = self.new_event(report)

        # First pass: iterate over a copy as ``line`` is modified in the loop
        for key, value in line.copy().items():
            if key == 'Payload':
                # empty
                if value == 'AA==':  # NULL
                    del line[key]
                    continue

                # JSON string
                if value.startswith('{'):
                    for payload_key, payload_value in json.loads(value).items():
                        event[f'extra.payload.{payload_key}'] = payload_value
                    del line[key]
                else:
                    # base64-encoded JSON
                    try:
                        value = json.loads(utils.base64_decode(value))
                        # continue unpacking in next loop
                    except json.decoder.JSONDecodeError:
                        # not JSON: keep the decoded text
                        line[key] = utils.base64_decode(value)
            elif key == 'TLP' and value.lower() == 'unknown':
                del line[key]
            if isinstance(value, dict):
                # flatten one level: {"A": {"B": 1}} -> {"A.B": 1}
                for subkey, subvalue in value.items():
                    line[f'{key}.{subkey}'] = subvalue
                del line[key]

        # Second pass: convert the values and save them in the event
        for key, value in line.items():
            if key == 'ThreatConfidence':
                if value == 'None':
                    continue
                value = event.get('feed.accuracy', 100) * CONFIDENCE[value] / 100
            elif key == 'DateTimeReceivedUtc':
                value = DateTime.from_windows_nt(value)
            elif key == 'Payload.ts':
                value = DateTime.from_timestamp(value)
            elif key == 'Payload.Protocol':
                # "HTTP/1.1": only the part before the slash is the protocol.
                # partition() keeps the complete value if there is no slash,
                # slicing with the result of find() would cut off the last
                # character in that case.
                payload_protocol = value.partition('/')[0]
                if payload_protocol:
                    # needs to overwrite a field previously parsed and written
                    event.add('protocol.application', payload_protocol, overwrite=True)  # "HTTP/1.1", save additionally
            elif key == 'Payload.domain':
                # Sometimes the destination address is also given as domain, ignore it here as we already save it as destination.ip (see https://github.com/certtools/intelmq/pull/2144)
                if not FQDN.is_valid(value) and value == line.get('Payload.serverIp'):
                    continue
            elif not value:
                continue
            if AZURE[key] != '__IGNORE__':
                # feed.accuracy is calculated newly and always needs to be overwritten
                event.add(AZURE[key], value, overwrite=self.overwrite or AZURE[key] == "feed.accuracy")
        event.add('classification.type', 'infected-system')
        event.add('raw', raw)
        yield event


# Module-level alias of the bot class.
# NOTE(review): presumably the name under which the framework looks up the
# bot class of this module - confirm against the loader.
BOT = MicrosoftCTIPParserBot
