# AUTOGENERATED! DO NOT EDIT! File to edit: ../../notebooks/02_preprocess_data.ipynb.

# %% auto 0
__all__ = ['process_data']

# %% ../../notebooks/02_preprocess_data.ipynb 14
import os
import subprocess
import logging
import hydra

from .download import download_datasets
from rocks_classifier.data.utils import (
    clean_download_files,
    find_filepaths,
    get_df,
    get_value_counts,
    move_to_processed,
    sampling,
    clean_images,
)

# %% ../../notebooks/02_preprocess_data.ipynb 15
# @hydra.main(config_path="../../configs", config_name="config", version_base="1.2")
def process_data():
    """Run the dataset preprocessing pipeline end to end.

    The function takes no arguments. The configuration is read from
    ``configs/config.yaml`` (a relative path, so it is resolved against the
    current working directory) and handed to the cleaning and sampling steps.

    Steps, in order:

    1. Load the configuration with OmegaConf.
    2. Call ``clean_download_files``, ``download_datasets`` and
       ``move_to_processed`` to (re)populate ``data/2_processed``.
    3. Print every file path under ``data/2_processed/`` that does not end in
       ``jpg`` or ``png`` (case-sensitive suffix check).
    4. Report file-type counts, run ``clean_images(cfg)``, and report the
       file-type counts again, followed by the per-class counts.
    5. Call ``sampling(cfg)``.

    NOTE(review): per the original description, ``clean_images`` removes
    unsupported/corrupted images and ``sampling`` splits the data into
    train, val and test -- confirm against ``rocks_classifier.data.utils``.

    Returns
    -------
    None
    """
    import omegaconf

    # Load the config first so a missing/invalid file fails before any data
    # on disk is touched.
    cfg = omegaconf.OmegaConf.load('configs/config.yaml')

    # Reset previous downloads, fetch the datasets, then stage them.
    clean_download_files()
    download_datasets()
    move_to_processed()

    # List everything that is neither a jpg nor a png.
    print("\n\nFiles other than jpg and png.\n")
    all_paths, _ = find_filepaths('data/2_processed/')
    non_image_paths = [
        path for path in all_paths if not path.endswith(('jpg', 'png'))
    ]
    print('\n'.join(non_image_paths))

    # Show the file-type distribution before and after cleaning.
    print("\nFile types before cleaning:")
    get_value_counts("data/2_processed")

    clean_images(cfg)

    print("\nFile types after cleaning:")
    get_value_counts("data/2_processed")

    # Per-class counts of what survived the cleaning step.
    print("\nCounts of classes:\n")
    get_value_counts("data/2_processed", column="class")

    sampling(cfg)

