#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
import argparse
import base64
import gc
import json
import logging
import os
import re
import requests
import shutil
import signal
import sys
from datetime import date
from os.path import basename, dirname, exists, join, splitext
from subprocess import call, Popen, PIPE, STDOUT
P3 = sys.version_info >= (3,0)
# Python2/3 specific imports
if P3:
    from urllib.request import getproxies, url2pathname
    from urllib.parse import unquote, urlparse, urljoin
else:
    from urllib import getproxies, unquote, url2pathname
    from urlparse import urljoin, urlparse
# BeautifulSoup
try:
    import bs4
except ImportError:
    print("BeautifulSoup is not installed !\nPlease run 'sudo pip install beautifulsoup4' before continuing.")
    sys.exit(1)
# colorize logging
try:
    import coloredlogs
    colored_logs_present = True
except ImportError:
    print("(Install 'coloredlogs' for colored logging)")
    colored_logs_present = False
# disable annoying requests warnings
try:
    from requests.packages.urllib3.exceptions import InsecureRequestWarning
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
except AttributeError:
    print("Failed to disable warnings for requests !\nPlease run 'sudo pip install --upgrade requests' to fix it.")
    sys.exit(1)
logging.getLogger("requests").setLevel(logging.CRITICAL)


# Module metadata:
# - __version__, __copyright__, __author__ and __repository__ are printed by the -V/--version action (ArgVersion)
# - __doc__ and __repository__ respectively feed the description and the epilog of the argument parser
__author__    = "Alexandre D'Hondt"
__email__     = "alexandre.dhondt@gmail.com"
# the end year of the copyright notice follows the current date
__copyright__ = "© 2017-%d A. D'Hondt" % date.today().year
__version__   = "1.18"
__license__   = """License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>.
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law."""
__repository__ = "https://github.com/dhondta/webgrep"
# help text displayed as the description of the tool (formatted as is, given the RawTextHelpFormatter of the parser)
__doc__        = """Search for PATTERN in each input URL and its related resources (images, scripts and style sheets).
By default,
 - resources are NOT downloaded
 - response HTTP headers are NOT included in grepping ; use '--include-headers'
 - PATTERN is a basic regular expression (BRE) ; use '-E' for extended (ERE)
Important note: webgrep does not handle recursion (in other words, it does not spider additional web pages).
Examples:
  webgrep example http://www.example.com     # will only grep on HTML code
  webgrep -r example http://www.example.com  # will only grep on LOCAL images, ...
  webgrep -R example http://www.example.com  # will only grep on ALL images, ...
"""


# -------------------- CONSTANTS SECTION --------------------
# Matches url(...) references in style sheets. A raw bytes literal is used so that the backslashes are genuine regex
#  escapes (not invalid string escapes) ; br'...' is a plain 'str' under Python 2 and 'bytes' under Python 3, which is
#  in both cases the type of the resource content the pattern is applied on
CSS_URL_REGEX = re.compile(br'url\(([^)]+)\)')
DATE_FORMAT = '%H:%M:%S'
# sink for the output of external commands ; it is never closed in this part of the file
DEVNULL = open(os.devnull, 'w')
# file name template for data attached to a resource: [type]-[zero-padded counter].[extension]
DATA_NAME = '{}-{:>03}.{}'
LOG_FORMAT = '%(asctime)s [%(levelname)s] %(message)s'
# proxy settings as detected by urllib or, if none is found, built from the upper-case *_PROXY environment variables
#  (keys are the lower-case schemes, e.g. 'http')
PROXIES = getproxies() or {k.split('_')[0].lower(): os.environ[k] for k in ['HTTP_PROXY', 'HTTPS_PROXY', 'FTP_PROXY']
                           if k in os.environ}
TEMP_DIR = '/tmp/webgrep'
CACHE_FILE = "cache.json"


# -------------------- HANDLERS SECTION ---------------------
# Categories:
# - preprocessors: applied on downloaded resources ; these REPLACE the resource
# - tools: applied after resource download and processing ; these DERIVE new resources
# Naming convention:
#   [resource type]_[preprocessors|tools]
#   e.g. STYLE_TOOLS, IMAGE_PREPROCESSORS
# Structure: dictionary with
# - key: the name of the binary or module
# - value: tuple of the form (type, handler, message if not exists)
#    NB: the handler always takes a Resource object in argument
# Handlers are automatically set as attributes in 'args'
def b(c):
    """ Convert the input to a byte string (UTF-8 encoded) ; byte strings are returned as is. """
    if P3:
        return c if isinstance(c, bytes) else bytes(c, 'utf-8')
    # Python 2: only 'unicode' objects need to be encoded
    return c.encode('utf-8') if isinstance(c, unicode) else c


def s(c):
    """ Convert the input to a text string ; only relevant under Python 3, where bytes are decoded as UTF-8. """
    if P3 and isinstance(c, bytes):
        return c.decode("utf-8")
    return c


def get_cmd(t):
    """ Make a handler that runs the given binary on the file of a Resource and returns what it wrote on stdout.

    :param t: name of the binary to be executed
    :return:  handler taking a Resource object in argument
    """
    def _run(r):
        # the command is run from the temporary folder as 'rel_fn' is relative to it ; stderr is discarded
        proc = Popen([t, r.rel_fn], stdout=PIPE, stderr=DEVNULL, cwd=args.tmp)
        return proc.communicate()[0]
    return _run

def css_unminifier(res, indent=2, maxlen=256):
    """ Minimalistic CSS unminifying function.

    :param res:    Resource object whose 'content' attribute holds the style sheet (converted to bytes in place)
    :param indent: number of spaces inserted after each opening brace and each semicolon
    :param maxlen: line length above which the style sheet is considered as minified
    :return:       content of the resource as bytes (reformatted if minified), None if 'res' is not a style sheet
    """
    if res.type != "style":
        return
    res.content = b(res.content)
    # CSS is considered minified if any line is longer than a given length
    if any(len(l) > maxlen for l in res.content.split(b('\n'))):
        pad = b(" ") * indent
        # only literal tokens are substituted, hence no regular expression is needed ; the order matters as the last
        #  step inserts semicolons that must not be expanded by the previous one
        res.content = res.content.replace(b("*/"), b("*/\r\n"))
        res.content = res.content.replace(b("{"), b(" {\r\n") + pad)
        res.content = res.content.replace(b(";"), b(";\r\n") + pad)
        res.content = res.content.replace(b("}"), b(";\r\n}\r\n"))
    return res.content


def inline_items(tag, rtype=None, attrs=None):
    """ Factory of page preprocessors extracting the inline items (i.e. the given tag) from a Web page.

    :param tag:   name of the HTML tag to be extracted
    :param rtype: resource type to be assigned to the extracted items (defaults to the tag name)
    :param attrs: additional keyword-arguments for filtering the tags with BeautifulSoup's find_all
    :return:      preprocessing function taking a page Resource in argument
    """
    rtype = rtype or tag
    attrs = attrs or {}

    def _inline_items(page):
        """ Extract each matching item as a new resource, grep it, then return the page without these items. """
        data_url = "data:inline/{};none,...".format(rtype)
        for item in page.soup.find_all(tag, **attrs):
            with Resource(data_url, page) as res:
                res.type = rtype
                already_loaded = hasattr(res, "content")  # i.e. retrieved from the cache
                if not already_loaded:
                    logger.debug("> Extracting {}".format(res.abs_fn))
                    res.content = item.text or item.html or item.string
                    res.preprocess()
                    with open(res.abs_fn, 'wb') as f:
                        f.write(b(res.content))
                # remove the item from the page so that it is not grepped twice
                item.decompose()
                res.grep().handle()
        return b(str(page.soup))

    return _inline_items


def steghide(res):
    """ Steghide handler for applying steganography on images (only trying an empty passphrase).

    :param res: Resource object ; its 'abs_fn' attribute is the path of the image to be analyzed
    :return:    extracted data as bytes (empty if nothing could be extracted), None if 'res' is not an image
    """
    if res.type != "image":
        return
    import tempfile  # local import ; only needed by this handler
    # use a private temporary file instead of a fixed, predictable path in /tmp
    fd, tmp_file = tempfile.mkstemp(prefix="steghide-")
    os.close(fd)
    try:
        # the command is passed as a list (no shell) as the file name derives from a remote URL and must not be
        #  interpreted ; '-f' allows steghide to overwrite the (empty) temporary file
        cmd = ['steghide', 'extract', '-sf', res.abs_fn, '-p', '', '-f', '-xf', tmp_file]
        try:
            _, output = Popen(cmd, stdout=PIPE, stderr=PIPE).communicate()
        except OSError:  # binary not available ; behave as if nothing was extracted
            return b("")
        # the extraction is considered successful when the output file is mentioned on stderr
        if b(tmp_file) not in output:
            return b("")
        with open(tmp_file, 'rb') as f:
            return f.read()
    finally:
        os.remove(tmp_file)


def tesseract(res):
    """ OCR handler: run Tesseract on an image with several page segmentation modes and collect the distinct lines.

    :param res: Resource object ; its 'rel_fn' attribute is the path of the image, relative to the temporary folder
    :return:    recognized lines (without duplicates, in order of appearance) joined as bytes, None if 'res' is not
                 an image
    """
    if res.type != "image":
        return
    lines = []
    # try the page segmentation modes 3 to 10
    for mode in range(3, 11):
        proc = Popen(['tesseract', res.rel_fn, 'stdout', '-psm', str(mode)], stdout=PIPE, stderr=DEVNULL,
                     cwd=args.tmp)
        text = proc.communicate()[0].strip()
        if len(text) == 0:
            continue
        for line in text.split(b('\n')):
            if line in lines:
                continue
            lines.append(line)
    return b('\n').join(lines)


# Resource types that can have handlers, that is, [TYPE]_PREPROCESSORS and/or [TYPE]_TOOLS dictionaries
RESOURCE_TYPES = ['page', 'image', 'script', 'style']
# tools deriving new data from downloaded images
IMAGE_TOOLS = {
    "exiftool": ('binary', get_cmd("exiftool"), "Binary required for getting image EXIF info ;\n consider running "
                 "'sudo apt-get install exiftool'"),
    # NB: the 'strings' binary is shipped with the 'binutils' package
    "strings": ('binary', get_cmd("strings"), "Binary required for getting strings from downloaded files ;\n consider "
                "running 'sudo apt-get install binutils'"),
    "steghide": ('binary', steghide, "Binary required for applying steganography on images ;\n consider running "
                 "'sudo apt-get install steghide'"),
    "tesseract": ('binary', tesseract, "Binary required for trying OCR on images ;\n consider running "
                  "'sudo apt-get install tesseract-ocr'"),
}
# preprocessors extracting inline scripts (i.e. script tags without 'src' attribute) and inline styles from pages
PAGE_PREPROCESSORS = {
    "inline-script": ('function', inline_items("script", attrs={'src': False}), None),
    "inline-style":  ('function', inline_items("style"), None),
}
# NB: the 'jsbeautifier' name is only bound once the module has been imported by the installation check
SCRIPT_PREPROCESSORS = {
    "jsbeautifier": ('module', lambda r: b(jsbeautifier.beautify(s(r.content))), "Python library required for "
                     "deobfuscating Javascript ;\n consider running 'sudo pip install jsbeautifier'"),
}
STYLE_PREPROCESSORS = {
    "unminifier": ('function', css_unminifier, None),
}


# -------------------- FUNCTIONS SECTION --------------------
def __exit_handler(signal=None, frame=None, code=0):
    """ Exit handler ; save the cache or remove the temporary folder (depending on '--keep-files'), shut the logging
         down then exit.

    NB: the first two parameters are only there for matching the signature of a signal handler.

    :param signal: signal number (unused)
    :param frame: stack frame (unused)
    :param code: exit code
    """
    # 'args' only exists once the arguments have been parsed
    if 'args' in globals():
        if args.keep:
            if args.cache:
                logger.debug("Saving cache at {}".format(args.cache_file))
                with open(args.cache_file, 'w') as f:
                    json.dump(args.cache, f, indent=2)
            logger.info("Temporary files are available at {}".format(args.tmp))
        else:
            logger.debug("Removing temporary folder {}".format(args.tmp))
            # best-effort cleanup ; a missing or partially removed folder must not prevent from exiting with 'code'
            shutil.rmtree(args.tmp, ignore_errors=True)
    logging.shutdown()
    sys.exit(code)
# bind termination signal (Ctrl+C) to exit handler
signal.signal(signal.SIGINT, __exit_handler)


def __installed(item, itype, message=None):
    """ Item existence check ; display message if the item is not installed and return boolean so that it can be
         determined if the item can be used.

    NB: when a module is found, it is also bound to its name in the global scope.

    :param item: item to be checked
    :param itype: item type (binary | module | function)
    :param message: message to be displayed if the item is not installed
    :return: True if the item is installed, False otherwise (including when the item type is unknown).
    """
    if itype == 'function':
        # no check required ; this relates to a locally declared function
        return True
    if itype == 'binary':
        try:
            # an OSError is raised if the binary cannot be executed
            call([item, '--help'], stdout=DEVNULL, stderr=STDOUT)
            return True
        except OSError:
            pass
    elif itype == 'module':
        try:
            globals()[item] = __import__(item)
            return True
        except ImportError:
            pass
    else:
        logger.warning("Unknown item type ({})".format(itype))
        return False
    # at this point, the binary or the module is missing
    if message is not None:
        logger.warning(message)
    return False


# --------------------- CLASSES SECTION ---------------------
class ArgCollectOption(argparse.Action):
    """ Argparse action for handling keyword arguments collection in grep_opts so that these can be passed to grep.

    The option is rebuilt from the destination name ('-X' for a single letter, '--long-name' otherwise, underscores
     being replaced by hyphens) and appended with its value (as a string) to 'grep_opts'. The destination itself is
     removed from the namespace.
    """
    def __call__(self, parser, args, values, option_string=None):
        if getattr(args, "grep_opts", None) is None:
            args.grep_opts = []
        if values is not None:
            if len(self.dest) == 1:
                args.grep_opts.extend(["-" + self.dest, "{}".format(values)])
            elif len(self.dest) > 1:
                dest = self.dest.replace('_', '-')
                args.grep_opts.extend(["--" + dest, "{}".format(values)])
        # the attribute is already gone when the option is used more than once (e.g. '-e p1 -e p2')
        if hasattr(args, self.dest):
            delattr(args, self.dest)


class ArgVersion(argparse.Action):
    """ Argparse action printing the version information then exiting (bound to -V/--version). """
    def __call__(self, parser, args, values, option_string=None):
        banner = [
            "webgrep {}".format(__version__),
            __copyright__,
            "\nWritten by {}, see <{}>.".format(__author__, __repository__),
        ]
        for line in banner:
            print(line)
        # the handler is retrieved through globals() as, inside a class, a name starting with two underscores (and
        #  not ending with two underscores) is subject to name mangling
        globals()['__exit_handler']()


class GetHeader(argparse.Action):
    """ Collect HTTP headers.

    The value is stored in the 'headers' dictionary of the namespace, with the capitalized destination name as the
     key ; the destination itself is removed from the namespace.
    """
    def __call__(self, parser, args, values, option_string=None):
        if getattr(args, "headers", None) is None:
            args.headers = {}
        if values is not None:
            args.headers[self.dest.capitalize()] = values
        # the attribute is already gone when the option is used more than once ; the last value wins
        if hasattr(args, self.dest):
            delattr(args, self.dest)


#based on: https://stackoverflow.com/questions/10123929/fetch-a-file-from-a-local-url-with-python-requests
class LocalFileAdapter(requests.adapters.BaseAdapter):
    """ Protocol Adapter to allow Requests to GET file:// URL's. """
    @staticmethod
    def _check_path(method, path):
        """ Return an HTTP status for the given filesystem path.

        :param method: HTTP method of the request (case-insensitive)
        :param path:   filesystem path targeted by the request
        :return:       tuple (status code, reason)
        """
        method = method.lower()
        # NB: the HTTP method is named OPTIONS (with a trailing 'S')
        if method in ["delete", "options", "put", "trace"]:
            return 501, "Not Implemented"
        elif method not in ["get", "head"]:
            return 405, "Method Not Allowed"
        elif os.path.isdir(path):
            return 400, "Path Not A File"
        elif not os.path.isfile(path):
            return 404, "File Not Found"
        elif not os.access(path, os.R_OK):
            return 403, "Access Denied"
        return 200, "OK"

    def send(self, request, **kwargs):
        """ Return the file specified by the given request.

        :param request: prepared request whose URL path is converted to a local filesystem path
        :param kwargs:  additional sending options (ignored)
        :return:        Response object whose raw stream is the file opened in binary mode (GET only)
        """
        path, resp = os.path.normcase(os.path.normpath(url2pathname(request.path_url))), requests.Response()
        resp.status_code, resp.reason = self._check_path(request.method, path)
        # the file is only opened when its content is requested, that is, not for HEAD requests
        if resp.status_code == 200 and request.method.lower() != "head":
            try:
                resp.raw = open(path, 'rb')
            except (OSError, IOError) as err:
                resp.status_code = 500
                resp.reason = str(err)
        resp.url = request.url.decode('utf-8') if isinstance(request.url, bytes) else request.url
        resp.request = request
        resp.connection = self
        return resp

    def close(self):
        """ Nothing to clean up ; the adapter does not hold any connection. """
        pass


class ProxySetting(argparse.Action):
    """ Manually set proxy setting.

    The value is stored in the global PROXIES dictionary with the destination name as the key ; the destination
     itself is removed from the namespace.
    """
    def __call__(self, parser, args, values, option_string=None):
        if values is not None:
            PROXIES[self.dest] = values
        # the attribute is already gone when the option is used more than once ; the last value wins
        if hasattr(args, self.dest):
            delattr(args, self.dest)


class Resource(object):
    """ Class for downloading web page and its related resources and for grepping each downloaded data.

    It is meant to be used as a context manager: entering loads the resource from the cache or downloads it, exiting
     registers it to the cache. It relies on the module-level 'args', 'logger', 'session' and 'url' objects.
    """
    # resource types that are not grepped as is (only the data derived from them by the tools is)
    grep_exclude = ["image"]
    # default HTTP headers of the requests ; NB: this class-level dictionary is updated with the user-defined headers
    headers = {'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64;"
                             " rv:50.0) Gecko/20100101 Firefox/50.0",
               'Accept': "text/html,application/xhtml+xml,application/"
                         "xml;q=0.9,*/*;q=0.8",
               'Accept-Language': "en-US,en;q=0.5",
               'Accept-Encoding': "gzip, deflate",
               'Connection': "keep-alive",
               'DNT': "1",
               'Upgrade-Insecure-Requests': "1"}

    def __init__(self, rurl, parent=None):
        """ Compute the attributes and file paths of the resource ; for a data URI, also write its data to a file.

        NB: a data URI with an unsupported encoding is flagged as in error and gets no 'rel_fn'/'abs_fn' attribute.

        :param rurl:   URL of the resource (may be a data URI, i.e. 'data:[category]/[type];[encoding],[data]')
        :param parent: Resource object from which this one originates (None for a main resource)
        """
        self.url = rurl
        self.parent = parent
        self.path = dirname(rurl) + "/"
        parsed = urlparse(rurl)
        self.base = "{}://{}".format(parsed.scheme, parsed.netloc)
        self.netloc = parsed.netloc
        self._download = True
        self._error = False
        self._raw_data = 0  # counter of the data files attached to this resource, used for naming them
        self._main = parent is None or parsed.scheme == "data"
        self._same_origin = parent is None or self.base == parent.base or parsed.scheme == 'data'  # i.e. embedded image
        # check if the resource is an embedded data
        if parsed.scheme == "data":
            self._download = False
            # first, decompose the path (data:[parsed.path])
            #  form: [category]/[type];[encoding],[data]
            #  e.g.: image/png;base64,...
            try:
                rtype, data = parsed.path.split(";", 1)
                self.type, ext = rtype.split('/', 1)
                enc, data = data.split(",", 1)
            except ValueError:  # occurs e.g. with "image/svg+xml,%3Csvg..."
                rtype, data = parsed.path.split(",", 1)
                self.type, ext = rtype.split('/', 1)
                ext = ext.split("+", 1)[0]
                data = unquote(data)
                enc = "none"
            if enc.strip() not in ["none", "base64"]:
                logger.warning("{} (from {})".format("Bad image encoding", self.parent.rel_fn))
                self._error = True
                return
            fn = DATA_NAME.format(self.type, self.parent._raw_data, ext)
            self.rel_fn = "{}_{}".format(self.parent.rel_fn, fn)
            self.abs_fn = "{}_{}".format(self.parent.abs_fn, fn)
            # data only made of dots (e.g. '...') is a placeholder ; the file is then to be written by the caller
            if len(data.strip('.')) > 0:
                with open(self.abs_fn, 'wb') as f:
                    content = data if enc == "none" else base64.b64decode(data)
                    f.write(b(content))
            self.parent._raw_data += 1
        # otherwise, compute resource paths
        else:
            url_path = parsed.path
            fn = basename(url_path) or "index.html"
            fp = dirname(url_path).strip('/')
            # NOTE(review): the path comes from the URL as is ; '..' segments are not filtered out - to be confirmed
            self.rel_fn = join(self.netloc, fp, fn)
            self.abs_fn = join(args.tmp, self.rel_fn)
        abs_fp = dirname(self.abs_fn)
        if not exists(abs_fp):
            os.makedirs(abs_fp)

    def __enter__(self):
        return self._load()

    def __exit__(self, *unused):
        self._cache()
        gc.collect()

    def _allowed(self):
        """ Handle only same-origin or also external resources. """
        return self._main or args.inc_all or (args.inc_local and self._same_origin)

    def _cache(self):
        """ Resource caching for use with '--keep-files'. """
        # a resource without file name (i.e. failed data URI) cannot be cached
        if not hasattr(self, "rel_fn") or args.cache is None:
            return
        # NB: the entry of the current URL may not exist yet in the cache
        if self.rel_fn in args.cache.get(url, {}):
            return
        logger.debug("> Registering '{}' to cache".format(self.rel_fn))
        if not hasattr(self, "type"):
            self._update_type()
        if self.type != "undefined":
            args.cache.setdefault(url, {})[self.rel_fn] = self.type

    def _load(self):
        """ Resource loading for use with '--keep-files', allowing to re-grep on different keywords without
         re-downloading everything.

        :return: the resource itself, either loaded from the cache or handed to download()
        """
        # a resource without file name (i.e. failed data URI) can neither be loaded nor downloaded
        if not hasattr(self, "rel_fn"):
            return self
        cached = args.cache is not None and self.rel_fn in args.cache.get(url, {})
        if cached and exists(self.abs_fn):
            with open(self.abs_fn, 'rb') as f:
                self.content = f.read()
            self.type = args.cache[url][self.rel_fn]
            logger.debug("> Loading '{}' from cache".format(self.rel_fn))
            return self
        return self.download()

    def _update_type(self):
        """ Update resource type based on HTTP response content type.

        The type is set to 'failure' for a resource in error and to 'undefined' when no content type is available.
        """
        if self._error:
            self.type = "failure"
            return
        try:
            ct = self.response.headers['Content-Type'].split(';')[0].split(',')
        except (AttributeError, KeyError):  # no response attached or no content type in the response
            self.type = "undefined"
            return
        _, ext = splitext(self.url)
        # only consider the following MIME types
        if any(c.startswith("image") for c in ct):
            self.type = "image"
        elif "text/css" in ct:
            self.type = "style"
        elif "application/javascript" in ct or \
            (any(c in ["application/octet-stream", "text/plain", "text/html"] for c in ct) and ext == ".js"):
            self.type = "script"
        elif "text/html" in ct:
            self.type = "page"
        else:
            self.type = "other"
        logger.debug("> Resource type set to '{}'".format(self.type))

    def download(self):
        """ Download the resource, preprocess it according to its type, then save it to the temporary folder.

        :return: the resource itself (untouched if it is not to be downloaded or not allowed)
        """
        if not self._download or not self._allowed():
            return self
        logger.debug("> Downloading {}".format(self.url))
        self.headers.update(args.headers)
        req = requests.Request('GET', self.url, headers=self.headers).prepare()
        try:
            resp = session.send(req, proxies=PROXIES if args.proxy else {}, verify=False, stream=True)
        except requests.exceptions.ProxyError:
            if bool(PROXIES):
                # try next requests disabling the proxy
                logger.debug("Disabling proxy settings")
                globals()['PROXIES'] = {}
                resp = session.send(req, proxies=PROXIES if args.proxy else {}, verify=False, stream=True)
            else: # proxy already disabled, just throw the exception
                raise
        # attach HTTP headers if they are to be grepped
        if args.inc_headers:
            self.resp_headers = resp.headers
        # print debug information if necessary
        Resource.pprint_req(req)
        Resource.pprint_resp(resp)
        # adjust resource type if necessary
        self.response = resp
        self._error = resp.status_code != 200
        self._update_type()
        if resp.status_code == 200:
            # images handled separately in chunks, no preprocessor applied
            if self.type == "image":
                with open(self.abs_fn, 'wb') as f:
                    for chunk in resp:
                        f.write(b(chunk))
                self.content = ''
            # other types of resources are handled with preprocessors
            else:
                self.content = resp.text.encode(resp.encoding) if resp.encoding is not None else resp.text
                if self.type == "page":
                    self.soup = bs4.BeautifulSoup(self.content, 'html.parser')
                self.preprocess()
                with open(self.abs_fn, 'wb') as f:
                    f.write(b(self.content))
            self._cache()
        elif resp.status_code == 204:  # No Content
            pass
        else:
            reason = "({} - {})".format(resp.status_code, resp.reason)
            logger.error("Failed to get {} {}".format(self.url, reason))
        return self

    def grep(self):
        """ Call the grep command on the file of the resource (from the temporary folder) and print its output.

        :return: the resource itself
        """
        if not self._allowed() or self._error or self.type in self.grep_exclude: # or not hasattr(self, "type")
            return self
        try:
            from shlex import quote
        except ImportError:  # Python 2
            from pipes import quote
        logger.debug("> Grepping {}".format(self.abs_fn))
        # make the command from input arguments
        tokens = ['grep', '--color=always']
        if hasattr(args, "grep_opts"):
            tokens.extend(args.grep_opts or [])
        if hasattr(args, "pattern") and args.pattern is not None:
            tokens.append(args.pattern)
        # the file name derives from a (possibly remote) URL ; it is quoted so that the shell takes it as a single
        #  token and does not interpret it
        tokens.append(quote(self.rel_fn))
        cmd = ' '.join(tokens)
        # then execute it
        logger.debug(">> Command: {}".format(cmd))
        out = Popen(cmd, stdout=PIPE, stderr=STDOUT, cwd=args.tmp, shell=True).communicate()[0]
        # NB: despite the fact that shell=True is not recommended, it is used as, otherwise, the command is not properly
        #      handled regarding patterns with quotes ; the options and the pattern are thus passed as given by the user
        if len(out) > 0:
            print(out.decode('utf-8'))
        return self

    def handle(self):
        """ Handle the downloaded resource according to its type.
        - web page:     search for images (also handle raw images), scripts and style sheets
        - images:       apply multiple tools then only grep on these results
        - scripts:      try to deobfuscate then also grep on these results
        - style sheets: search for image references then download them for normal image handling

        :return: the resource itself
        """
        if not self._allowed() or self._error:
            return self
        # check for headers if they are to be included
        if args.inc_headers:
            with Resource("data:other/headers;none,...", self) as res:
                if not hasattr(res, "content"):  # not in cache
                    # 'resp_headers' does not exist when --keep-files was used as the resource is not downloaded again
                    if hasattr(self, "resp_headers"):  # download performed
                        with open(res.abs_fn, 'wb') as f:
                            f.write(b('\n'.join(": ".join(item) for item in self.resp_headers.items())))
                res.grep()
        # then, derive new resources by applying available tools as defined in the global constants (see HANDLERS
        #  SECTION above)
        attr = "{}_tools".format(self.type)
        if hasattr(args, attr):
            # create a new resource for each time a tool is executed
            for tool, handler in getattr(args, attr):
                raw = "data:{}/txt;none,...".format(tool)
                with Resource(raw, self) as res:
                    res.type = "derived"
                    if not hasattr(res, "content"):  # not in cache
                        logger.debug("> Deriving {}".format(res.abs_fn))
                        with open(res.abs_fn, 'wb') as f:
                            f.write(b(handler(self)))
                    res.grep()
        # now, handle resource for some particular types
        if self.type == "page":
            if not args.inc_local and not args.inc_all:
                return self
            if not hasattr(self, "soup"):
                self.soup = bs4.BeautifulSoup(self.content, 'html.parser')
            # download images
            for img in self.soup.find_all("img", src=True):
                with Resource(urljoin(self.path, img['src']), self) as res:
                    res.grep().handle()
            # download scripts
            for script in self.soup.find_all("script", src=True):
                with Resource(urljoin(self.path, script.attrs['src']), self) as res:
                    res.grep().handle()
            # download styles
            for link in self.soup.find_all("link", href=True, rel="stylesheet"):
                with Resource(urljoin(self.path, link.attrs['href']), self) as res:
                    res.grep().handle()
        elif self.type == "style":
            # search for image URLs in the style sheet
            for u in CSS_URL_REGEX.findall(self.content):
                u = u.strip(b("\'\"")).decode()
                if u.startswith("#"):  # exclude anchor tags
                    continue
                with Resource(urljoin(self.path, u), self) as res:
                    res.grep().handle()
        return self

    def preprocess(self):
        """ Preprocess the resource according to its type ; each preprocessor replaces the content of the resource,
             which is finally converted to bytes. """
        attr = '{}_preprocessors'.format(self.type)
        if hasattr(args, attr):
            for tool, handler in getattr(args, attr):
                logger.debug("> Preprocessing with {}".format(tool))
                self.content = handler(self)
            self.content = b(self.content)

    @staticmethod
    def pprint_req(request):
        """ Pretty print method for debugging HTTP communication (request line, sorted headers and body if POST). """
        data = "\n\n    {}\n".format(request.body) if request.method == "POST" else "\n"
        logger.debug("Sent request:\n    {} {}\n{}".format(request.method, request.url,
                     '\n'.join('    {}: {}'.format(k, v) for k, v in sorted(request.headers.items()))) + data)

    @staticmethod
    def pprint_resp(response):
        """ Pretty print method for debugging HTTP communication (status line and sorted headers). """
        logger.debug("Received response:\n    {} {}\n{}\n".format(response.status_code, response.reason,
                     '\n'.join('    {}: {}'.format(k, v) for k, v in sorted(response.headers.items()))))


# ----------------------- MAIN SECTION ----------------------
if __name__ == '__main__':
    global logger, session, args, url
    parser = argparse.ArgumentParser(
        prog=__file__.lstrip('./'), description=__doc__, add_help=False,
        usage="%s [OPTION]... PATTERN [URL]..." % basename(__file__),
        epilog="Please report bugs on GitHub: {}/issues/new".format(__repository__),
        formatter_class=argparse.RawTextHelpFormatter,
    )
    # argument groups are willingly the same as in grep
    regex = parser.add_argument_group("Regexp selection and interpretation")
    regex_pat = regex.add_mutually_exclusive_group()
    regex_pat.add_argument("-e", "--regexp", action=ArgCollectOption, help="use PATTERN for matching")
    regex_pat.add_argument("-f", "--file", action=ArgCollectOption, help="obtain PATTERN from FILE")
    regex_type = regex.add_mutually_exclusive_group()
    regex_type.add_argument("-E", "--extended-regexp", dest="grep_opts", const="-E", action="append_const",
                            help="PATTERN is an extended regular expression (ERE)")
    regex_type.add_argument("-F", "--fixed-strings", dest="grep_opts", const="-F", action="append_const",
                            help="PATTERN is a set of newline-separated fixed strings")
    regex_type.add_argument("-G", "--basic-regexp", dest="grep_opts", const="-G", action="append_const",
                            help="PATTERN is a basic regular expression (BRE)")
    regex_type.add_argument("-P", "--perl-regexp", dest="grep_opts", const="-P", action="append_const",
                            help="PATTERN is a Perl regular expression")
    regex.add_argument("-i", "--ignore-case", dest="grep_opts", const="-i", action="append_const",
                       help="ignore case distinctions")
    regex.add_argument("-w", "--word-regexp", dest="grep_opts", const="-w", action="append_const",
                       help="force PATTERN to match only whole words")
    regex.add_argument("-x", "--line-regexp", dest="grep_opts", const="-x", action="append_const",
                       help="force PATTERN to match only whole lines")
    regex.add_argument("-z", "--null-data", dest="grep_opts", const="-z", action="append_const",
                       help="a data line ends in 0 byte, not newline")
    misc = parser.add_argument_group("Miscellaneous")
    misc.add_argument("-s", "--no-messages", dest="grep_opts", const="-s", action="append_const",
                      help="suppress error messages")
    misc.add_argument("-v", "--invert-match", dest="grep_opts", const="-v", action="append_const",
                      help="select non-matching lines")
    misc.add_argument("-V", "--version", nargs=0, action=ArgVersion, help="print version information and exit")
    misc.add_argument("--help", action="help", help="display this help and exit")
    # ----- START new arguments -----
    misc.add_argument("--verbose", action="store_true", help="verbose mode")
    misc.add_argument("--keep-files", dest="keep", action="store_true",
                      help="keep temporary files in the temporary directory")
    misc.add_argument("--temp-dir", dest="tmp", default=TEMP_DIR,
                      help="define the temporary directory (default: {})".format(TEMP_DIR))
    # ----- END new arguments -----
    output = parser.add_argument_group("Output control")
    output.add_argument("-m", "--max-count", dest="m", metavar="NUM", type=int, action=ArgCollectOption,
                        help="stop after NUM matches")
    output.add_argument("-b", "--byte-offset", dest="grep_opts", const="-b", action="append_const",
                        help="print the byte offset with output lines")
    output.add_argument("-n", "--line-number", dest="grep_opts", const="-n", action="append_const",
                        help="print line number with output lines")
    output.add_argument("--line-buffered", dest="grep_opts", const="--line-buffered", action="append_const",
                        help="flush output on every line")
    output.add_argument("-H", "--with-filename", dest="grep_opts", const="-H", action="append_const",
                        help="print the file name for each match")
    output.add_argument("-h", "--no-filename", dest="grep_opts", const="-h", action="append_const",
                        help="suppress the file name prefix on output")
    output.add_argument("--label", dest="label", metavar="LABEL", type=str, action=ArgCollectOption,
                        help="use LABEL as the standard input filename prefix")
    output.add_argument("-o", "--only-matching", dest="grep_opts", const="-o", action="append_const",
                        help="show only the part of a line matching PATTERN")
    output.add_argument("-q", "--quiet", "--silent", dest="grep_opts", const="-q", action="append_const",
                        help="suppress all normal output")
    output_type = output.add_mutually_exclusive_group()
    output_type.add_argument("--binary-files", dest="binary_files", metavar="TYPE", choices=['binary', 'text',
                             'without-match'], default=None, action=ArgCollectOption,
                             help="assume that binary files are TYPE;\nTYPE is 'binary', 'text', or 'without-match'")
    output_type.add_argument("-a", "--text", dest="grep_opts", const="-a", action="append_const",
                             help="equivalent to --binary-files=text")
    output_type.add_argument("-I", dest="grep_opts", const="-I", action="append_const",
                             help="equivalent to --binary-files=without-match")
    output_match = output.add_mutually_exclusive_group()
    output_match.add_argument("-L", "--files-without-match", dest="grep_opts", const="-L", action="append_const",
                              help="print only names of FILEs containing no match")
    output_match.add_argument("-l", "--files-with-match", dest="grep_opts", const="-l", action="append_const",
                              help="print only names of FILEs containing matches")
    output.add_argument("-c", "--count", dest="grep_opts", const="-c", action="append_const",
                        help="print only a count of matching lines per FILE")
    output.add_argument("-T", "--initial-tab", dest="grep_opts", const="-T", action="append_const",
                        help="make tabs line up (if needed)")
    output.add_argument("-Z", "--null", dest="grep_opts", const="-Z", action="append_const",
                        help="print 0 byte after FILE name")
    context = parser.add_argument_group("Context control")
    context.add_argument("-B", "--before-context", dest="B", metavar="NUM", type=int, action=ArgCollectOption,
                         help="print NUM lines of leading context")
    context.add_argument("-A", "--after-context", dest="A", metavar="NUM", type=int, action=ArgCollectOption,
                         help="print NUM lines of trailing context")
    context.add_argument("-C", "--context", dest="C", metavar="NUM", type=int, action=ArgCollectOption,
                         help="print NUM lines of output context")
    regex_pat.add_argument("pattern", metavar="PATTERN", nargs='?', default=None, help=argparse.SUPPRESS)
    parser.add_argument("url", metavar="URL", nargs='+', help=argparse.SUPPRESS)
    # ----- START new arguments -----
    web = parser.add_argument_group("Web options")
    web.add_argument("-r", "--local-resources", dest="inc_local", action="store_true",
                     help="also grep local resources (same-origin)")
    web.add_argument("-R", "--all-resources", dest="inc_all", action="store_true",
                     help="also grep all resources (even non-same-origin)")
    web.add_argument("--include-headers", dest="inc_headers", action="store_true", help="also grep HTTP headers")
    web.add_argument("--cookie", action=GetHeader, help="use a session cookie in the HTTP headers")
    web.add_argument("--referer", action=GetHeader, help="provide the referer in the HTTP headers")
    web = parser.add_argument_group("Proxy settings (by default, system proxy settings are used)")
    web.add_argument("-d", "--disable-proxy", dest="proxy", action="store_false", help="manually disable proxy")
    web.add_argument("--http-proxy", dest="http", action=ProxySetting, help="manually set the HTTP proxy")
    web.add_argument("--https-proxy", dest="https", action=ProxySetting, help="manually set the HTTPS proxy")
    # ----- END new arguments -----
    args = parser.parse_args()
    args.cache = None
    if not args.keep:
        try:
            shutil.rmtree(args.tmp)
        except OSError:
            pass
    else:
        try:
            args.cache_file = join(args.tmp, CACHE_FILE)
            with open(args.cache_file) as f:
                args.cache = json.load(f)
        except Exception:
            args.cache = {}
    if args.pattern is None and (args.grep_opts is None or "--file" not in args.grep_opts and \
       "--regexp" not in args.grep_opts):
        parser.error("no pattern provided")
    if args.pattern is not None:
        args.pattern = "\"{}\"".format(args.pattern)
    elif "--regexp" in args.grep_opts:
        i = args.grep_opts.index("--regexp")
        args.grep_opts[i+1] = "\"{}\"".format(args.grep_opts[i+1])
    if not hasattr(args, "headers"):
        args.headers = {}
    # configure logging and get the root logger
    args.verbose = [logging.INFO, logging.DEBUG][args.verbose]
    logging.basicConfig(format=LOG_FORMAT, datefmt=DATE_FORMAT, level=args.verbose)
    logger = logging.getLogger()
    if colored_logs_present:
        coloredlogs.DEFAULT_LOG_FORMAT = LOG_FORMAT
        coloredlogs.DEFAULT_DATE_FORMAT = DATE_FORMAT
        coloredlogs.install(args.verbose)
    # attach tools and preprocessors to args
    for item in ['tools', 'preprocessors']:
        for res in RESOURCE_TYPES:
            a = '{}_{}'.format(res, item).upper()
            if a in globals():
                setattr(args, a.lower(), [(k, v[1]) for k, v in globals()[a].items() \
                                          if __installed(k, v[0], None if args.no_messages else v[2])])
    # running the main stuff ;
    done = []
    #  if a cache exists and --keep-files is used, use it
    if args.cache:
        for url in args.url:
            if url in done:
                continue
            for rel_fn in args.cache.get(url).keys():
                with Resource(rel_fn) as res:
                    res.grep()
        __exit_handler()
    #  otherwise, download the page and its resources
    try:
        session = requests.Session()
        session.mount("file://", LocalFileAdapter())
        logger.debug("Creating temporary folder {}".format(args.tmp))
        logger.debug("Cache {}".format(["enabled", "disabled"][args.cache is None]))
        if not exists(args.tmp):
             os.makedirs(args.tmp)
        for url in args.url:
            if url in done:
                continue
            logger.debug("Downloading page {} and its associated resources".format(url))
            with Resource(url) as page:
                page.grep().handle()
            done.append(url)
    except Exception as e:
        logger.exception("Unexpected error: {}".format(str(e)))
        __exit_handler(code=1)
    # gracefully close after running the main stuff
    __exit_handler()

