# -*- coding:utf-8 -*-
import html
import re
import urllib
import urllib.parse

import nltk

class MalwareDataProcesser(object):
    """Tokenize text samples line by line and build a token frequency table.

    Each input line is URL-decoded, HTML-unescaped and normalized (digit
    runs removed, URLs collapsed to ``http://u``, ``/* ... */`` comments
    removed), then split into tokens with ``nltk.regexp_tokenize``.  The
    token counts for one file are stored in ``self.freqdist``.
    """

    # Normalization patterns, compiled once and applied in this order.
    _DIGITS_RE = re.compile(r'\d+')
    _URL_RE = re.compile(r'(http|https)://[a-zA-Z0-9\.@&/#!#\?:=]+')
    # Non-greedy ``.*?`` so a comment of any length is removed, while two
    # separate comments on one line are not merged into a single match.
    _COMMENT_RE = re.compile(r'/\*.*?\*/')

    def __init__(self, min_len: int):
        """Create a processor.

        Args:
            min_len: Minimum length (after URL/HTML decoding) a line must
                have to be tokenized; shorter lines are skipped.
        """
        self.MIN_LEN = min_len
        # Verbose-mode alternation, tried left to right: double-quoted
        # strings, http URLs, closing tags, complete opening tags, opening
        # tag prefixes, ``name=`` attributes, a bare ``>``, function calls
        # such as ``alert(...)``, and finally plain words.
        self.tokens_pattern = r'''(?x)
             "[^"]+"
            |http://\S+
            |</\w+>
            |<\w+>
            |<\w+
            |\w+=
            |>
            |\w+\([^<]+\) 
            |\w+
            '''
        # Not used inside this class; kept because external code may rely
        # on these attributes.
        self.index_wordbag = 1
        self.wordbag = {}
        # Replaced by an ``nltk.FreqDist`` once ``load_freqdist`` has run.
        self.freqdist = {}

    def do_str(self, line: str):
        """Split ``line`` into tokens according to ``self.tokens_pattern``.

        Returns:
            The list of token strings produced by ``nltk.regexp_tokenize``.
        """
        return nltk.regexp_tokenize(line, self.tokens_pattern)

    def _normalize_line(self, line: str):
        """Decode and normalize one raw line read from the input file.

        Returns:
            The normalized text, or ``None`` when the decoded line is
            shorter than ``self.MIN_LEN`` and should be skipped.
        """
        line = line.strip('\n')
        # URL-decode (%3C -> <) and then resolve HTML entities (&lt; -> <).
        line = urllib.parse.unquote(line)
        line = html.unescape(line)
        if len(line) < self.MIN_LEN:
            return None

        # Digit runs are removed entirely (replacement is the empty string).
        # NOTE(review): an older comment here said "replaced to 8"; confirm
        # which normalization the downstream model expects.
        line = self._DIGITS_RE.sub("", line)
        # Collapse every URL to one placeholder so hosts/paths do not
        # explode the vocabulary.
        line = self._URL_RE.sub("http://u", line)
        # Drop /* ... */ comments.
        line = self._COMMENT_RE.sub("", line)
        return line

    def load_freqdist(self, filename: str):
        """Read ``filename`` and store its token frequencies in ``self.freqdist``.

        Every line that is long enough is normalized and tokenized, and all
        resulting tokens are counted in a single ``nltk.FreqDist``.

        Args:
            filename: Path of a text file with one sample per line.
        """
        tokens_list = []

        with open(filename) as f:
            for line in f:
                normalized = self._normalize_line(line)
                if normalized is None:
                    continue
                # Collect this line's tokens; without this step the
                # resulting distribution would always be empty.
                tokens_list.extend(self.do_str(normalized))

        self.freqdist = nltk.FreqDist(tokens_list)  # per-file token counts

    def getFreq(self):
        """Return the frequency table built by the last ``load_freqdist`` call.

        Returns an empty dict if ``load_freqdist`` has not been called yet.
        """
        return self.freqdist

# testDataProcessor = MalwareDataProcesser(5)
# ## Load the frequency dictionary from the file "xss-train.txt" to the field self.freqdist
# testDataProcessor.load_freqdist("xss-train.txt")

