/*
 * Decompiled with CFR 0.152.
 */
package zingg.preprocess;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.expressions.UserDefinedFunction;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import zingg.client.Arguments;
import zingg.client.FieldDefinition;
import zingg.client.ZinggClientException;
import zingg.util.PipeUtil;

/**
 * Removes configured stop words from the string columns of a dataset.
 *
 * <p>For every field definition that names a stop-words source, the words are
 * read, combined into one alternation regex, and stripped (case-insensitively,
 * by lower-casing both the pattern and the value) from that field's column.
 */
public class StopWords {
    protected static String name = "zingg.preprocess.StopWords";
    public static final Log LOG = LogFactory.getLog(StopWords.class);
    /** Preferred name of the column holding the words in a stop-words dataset. */
    protected static String stopWordColumn = "z_word";
    /** Index of the column used when {@link #stopWordColumn} is not present. */
    protected static final int COLUMN_INDEX_DEFAULT = 0;

    /**
     * Applies stop-word removal to every field of {@code ds} whose definition
     * specifies a stop-words source.
     *
     * @param spark session used to read the stop-words data
     * @param args  arguments supplying the field definitions and pipe settings
     * @param ds    dataset to clean
     * @return the dataset with the affected columns replaced by their lower-cased,
     *         stop-word-free values; unchanged if no field defines stop words
     * @throws ZinggClientException declared by the original signature; presumably
     *         raised when a stop-words source cannot be read
     */
    public static Dataset<Row> preprocessForStopWords(SparkSession spark, Arguments args, Dataset<Row> ds) throws ZinggClientException {
        for (FieldDefinition def : args.getFieldDefinition()) {
            String stopWordsLocation = def.getStopWords();
            // Compare by content: reference equality against "" misses non-interned empty strings.
            if (stopWordsLocation == null || stopWordsLocation.isEmpty()) {
                continue;
            }
            Dataset<Row> stopWords = PipeUtil.read(spark, false, false, PipeUtil.getStopWordsPipe(args, stopWordsLocation));

            // Resolve the column per dataset in a local variable so that a fallback
            // chosen for one field never leaks into the shared static default.
            String wordColumn = stopWordColumn;
            if (!Arrays.asList(stopWords.schema().fieldNames()).contains(wordColumn)) {
                wordColumn = stopWords.columns()[COLUMN_INDEX_DEFAULT];
            }

            // Null or blank entries would add an empty alternative to the regex, which
            // matches at every word boundary and swallows the whitespace between words.
            List<String> wordList = stopWords.select(wordColumn).as(Encoders.STRING()).collectAsList()
                    .stream()
                    .filter(word -> word != null && !word.trim().isEmpty())
                    .collect(Collectors.toList());
            if (wordList.isEmpty()) {
                LOG.warn("No stop words found for field " + def.getFieldName() + "; leaving it unchanged");
                continue;
            }

            String pattern = wordList.stream().collect(Collectors.joining("|", "\\b(", ")\\b\\s?"));
            ds = ds.withColumn(def.getFieldName(), StopWords.removeStopWords(pattern.toLowerCase()).apply(ds.col(def.getFieldName())));
        }
        return ds;
    }

    /**
     * Builds a UDF that lower-cases its string input and deletes every match of
     * the given regular expression.
     *
     * @param stopWordsRegexString regex matching the text to remove; it is applied
     *                             to the lower-cased value, so it should itself be lower case
     * @return a string-to-string UDF that yields {@code null} for {@code null} input
     */
    public static UserDefinedFunction removeStopWords(String stopWordsRegexString) {
        // Explicit type arguments are required: a lambda targeted at UDF1<?, ?> would see its parameter as Object.
        UDF1<String, String> remover = s -> {
            if (s == null) {
                return null;
            }
            return s.toLowerCase().replaceAll(stopWordsRegexString, "");
        };
        return functions.udf(remover, (DataType)DataTypes.StringType);
    }
}

