/*
 * Decompiled with CFR 0.152.
 */
package zingg.recommender;

import java.util.Arrays;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.functions;
import zingg.client.Arguments;
import zingg.client.ZinggClientException;
import zingg.util.PipeUtil;

/**
 * Recommends stop words for a single column of the configured input data.
 *
 * <p>The input is read through {@link PipeUtil}, the chosen column is split on
 * whitespace, and every token whose occurrence count exceeds
 * {@code totalTokens * args.getStopWordsCutoff()} is written out through the
 * stop-words pipe returned by {@link PipeUtil#getStopWordsPipe}.
 */
public class StopWordsRecommender {
    public static final Log LOG = LogFactory.getLog(StopWordsRecommender.class);
    protected SparkSession spark;
    protected Dataset<Row> data;
    JavaSparkContext ctx;
    public Arguments args;

    /**
     * @param spark session used to read the input data
     * @param ctx   context handed to {@link PipeUtil#write} when persisting results
     * @param args  job arguments (data pipes, column name, stop-words directory, cutoff)
     */
    public StopWordsRecommender(SparkSession spark, JavaSparkContext ctx, Arguments args) {
        this.spark = spark;
        this.ctx = ctx;
        this.args = args;
    }

    /**
     * Reads the configured data and generates the stop-words document for the
     * column named by {@code args.getColumn()}.
     *
     * <p>A failure to read the data is logged and treated as "no data" rather
     * than propagated, so nothing is generated in that case.
     *
     * @throws ZinggClientException if generating or writing the stop words fails
     */
    public void process() throws ZinggClientException {
        LOG.info("Data recommender starts");
        try {
            this.data = PipeUtil.read(this.spark, false, false, this.args.getData());
        }
        catch (ZinggClientException e) {
            // Best-effort read: keep the cause in the log and fall through to the
            // "nothing generated" branch below.
            LOG.warn("No data has been found", e);
        }
        // this.data is still null when the read above failed; dereferencing it
        // unguarded would throw a NullPointerException.
        if (this.data != null && !this.data.isEmpty()) {
            this.createStopWordsDocuments(this.data, this.args.getColumn(), this.ctx);
        } else {
            LOG.info("No data recommendation generated");
        }
        LOG.info("Data recommender finishes");
    }

    /**
     * Computes the stop words of {@code fieldName} and writes them through the
     * stop-words pipe targeting {@code args.getStopWordsDir() + fieldName}.
     *
     * <p>Nothing is written (an informational message is logged instead) when
     * {@code data} is empty, {@code fieldName} is {@code null}, or
     * {@code fieldName} is not a field of {@code data}'s schema.
     *
     * @param data      input rows
     * @param fieldName name of the column to analyse
     * @param ctx       context handed to {@link PipeUtil#write}
     * @throws ZinggClientException declared for failures of the underlying pipe utilities
     */
    public void createStopWordsDocuments(Dataset<Row> data, String fieldName, JavaSparkContext ctx) throws ZinggClientException {
        if (data.isEmpty()) {
            LOG.info("No stopwords generated");
            return;
        }
        // Validate the column that is actually processed below (fieldName), not
        // a possibly different value held in args.
        if (fieldName == null) {
            LOG.info("Please provide '--column <columnName>' option at command line to generate stop words for that column.");
            return;
        }
        if (!Arrays.asList(data.schema().fieldNames()).contains(fieldName)) {
            LOG.info("An invalid column name - " + fieldName + " entered. Please provide valid column name.");
            return;
        }
        String filenameCSV = this.args.getStopWordsDir() + fieldName;
        Dataset<Row> stopWords = this.findStopWords(data, fieldName);
        PipeUtil.write(stopWords, this.args, ctx, PipeUtil.getStopWordsPipe(this.args, filenameCSV));
    }

    /**
     * Finds the tokens of {@code fieldName} that occur more often than the
     * configured cutoff fraction of all tokens.
     *
     * @param data      input rows
     * @param fieldName name of the column to tokenise (split on {@code \s+})
     * @return {@code data} unchanged when it is empty; otherwise a single-partition
     *         dataset with columns {@code z_word} and {@code z_count} holding only
     *         the tokens whose count is strictly greater than the threshold
     */
    public Dataset<Row> findStopWords(Dataset<Row> data, String fieldName) {
        LOG.debug("Field: " + fieldName);
        if (data.isEmpty()) {
            return data;
        }
        Dataset<Row> tokens = data.select(functions.split(data.col(fieldName), "\\s+").as("z_split"));
        Dataset<Row> words = tokens.select(functions.explode(tokens.col("z_split")).as("z_word"));
        // Splitting can yield empty tokens (e.g. leading whitespace); drop them.
        words = words.filter(words.col("z_word").notEqual(""));
        Dataset<Row> counts = words.groupBy("z_word").count().withColumnRenamed("count", "z_count");

        Row totalRow = counts.agg(functions.sum("z_count")).collectAsList().get(0);
        // sum() over zero rows is SQL NULL (e.g. the column held only nulls or
        // empty strings); getLong on a null cell would throw.
        long total = totalRow.isNullAt(0) ? 0L : totalRow.getLong(0);

        double threshold = (float) total * this.args.getStopWordsCutoff();
        Dataset<Row> stopWords = counts.filter(counts.col("z_count").gt(threshold));
        // Collapse to one partition so the result is written from a single partition.
        return stopWords.coalesce(1);
    }
}

