package org.apache.pinot.hadoop.job.preprocess;

import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.zip.GZIPInputStream;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.avro.mapreduce.AvroKeyOutputFormat;
import org.apache.avro.mapreduce.AvroMultipleOutputs;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.pinot.hadoop.job.mappers.AvroDataPreprocessingMapper;
import org.apache.pinot.hadoop.job.partitioners.AvroDataPreprocessingPartitioner;
import org.apache.pinot.hadoop.job.reducers.AvroDataPreprocessingReducer;
import org.apache.pinot.hadoop.utils.preprocess.HadoopUtils;
import org.apache.pinot.segment.spi.partition.PartitionFunctionFactory;
import org.apache.pinot.shaded.com.google.common.base.Preconditions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/apache/pinot/hadoop/job/preprocess/AvroDataPreprocessingHelper.class */
/**
 * Avro-specific {@link DataPreprocessingHelper}.
 *
 * <p>Wires the Avro input/output formats, mapper, reducer and partitioner into a Hadoop
 * {@link Job}, using the schema read from the sample raw data path, and validates the
 * partition/sorting settings held by the superclass against that schema.
 */
public class AvroDataPreprocessingHelper extends DataPreprocessingHelper {
    private static final Logger LOGGER = LoggerFactory.getLogger(AvroDataPreprocessingHelper.class);

    /**
     * Creates the helper; both arguments are forwarded unchanged to the superclass constructor.
     *
     * @param list paths handed to the superclass (presumably the input data paths; confirm against
     *             {@code DataPreprocessingHelper})
     * @param path path handed to the superclass (presumably the output path; confirm against
     *             {@code DataPreprocessingHelper})
     */
    public AvroDataPreprocessingHelper(List<Path> list, Path path) {
        super(list, path);
    }

    /**
     * @return the partitioner class used for Avro preprocessing jobs
     */
    @Override
    public Class<? extends Partitioner> getPartitioner() {
        return AvroDataPreprocessingPartitioner.class;
    }

    /**
     * Configures {@code job} for Avro preprocessing: reads the schema from the sample raw data
     * path, validates the partition/sorting settings against it, then registers the Avro input
     * format, mapper, reducer, named output and key/value schemas.
     *
     * @param job the job to configure
     * @throws IOException if the sample data cannot be listed or read
     * @throws IllegalStateException if no {@code .avro} file is found at the sample raw data path
     * @throws IllegalArgumentException if the partition/sorting settings do not match the schema
     */
    @Override
    public void setUpMapperReducerConfigs(Job job) throws IOException {
        Schema avroSchema = getAvroSchema(this._sampleRawDataPath);
        LOGGER.info("Avro schema is: {}", avroSchema.toString(true));
        validateConfigsAgainstSchema(avroSchema);

        job.setInputFormatClass(AvroKeyInputFormat.class);
        job.setMapperClass(AvroDataPreprocessingMapper.class);
        job.setReducerClass(AvroDataPreprocessingReducer.class);

        AvroMultipleOutputs.addNamedOutput(job, "avro", AvroKeyOutputFormat.class, avroSchema);
        AvroMultipleOutputs.setCountersEnabled(job, true);
        LazyOutputFormat.setOutputFormatClass(job, AvroKeyOutputFormat.class);
        job.setOutputKeyClass(AvroKey.class);
        job.setOutputValueClass(NullWritable.class);

        AvroJob.setInputKeySchema(job, avroSchema);
        AvroJob.setMapOutputValueSchema(job, avroSchema);
        AvroJob.setOutputKeySchema(job, avroSchema);
    }

    /**
     * Returns the string form of column {@code str} in the first record of the sample raw data
     * file.
     *
     * @param str name of the column to read
     * @return {@code toString()} of that column's value in the first record
     * @throws IOException if the sample file cannot be opened or read
     * @throws NullPointerException if the first record has no value for {@code str}
     * @throws java.util.NoSuchElementException if the sample file contains no records
     */
    @Override
    String getSampleTimeColumnValue(String str) throws IOException {
        try (DataFileStream<GenericRecord> avroReader = getAvroReader(this._sampleRawDataPath)) {
            Object value = avroReader.next().get(str);
            // Same exception type as the bare dereference it replaces, but with a usable message.
            Preconditions.checkNotNull(value, "Column: %s is missing or null in the first record of %s", str,
                    this._sampleRawDataPath);
            return value.toString();
        }
    }

    /**
     * Reads the Avro schema from the first regular file ending in {@code .avro} that is listed
     * for {@code path}.
     *
     * @param path a file or directory to search
     * @return the schema embedded in the first matching file
     * @throws IOException if listing or reading fails
     * @throws IllegalStateException if no {@code .avro} file is listed for {@code path}
     */
    private Schema getAvroSchema(Path path) throws IOException {
        for (FileStatus fileStatus : HadoopUtils.DEFAULT_FILE_SYSTEM.listStatus(path)) {
            if (!fileStatus.isFile() || !fileStatus.getPath().getName().endsWith(".avro")) {
                continue;
            }
            LOGGER.info("Extracting schema from {}", fileStatus.getPath());
            // Open the file that was matched (not the listed path, which may be a directory)
            // and stop at the first hit instead of looping over it again.
            try (DataFileStream<GenericRecord> avroReader = getAvroReader(fileStatus.getPath())) {
                return avroReader.getSchema();
            }
        }
        throw new IllegalStateException("No .avro file found under: " + path);
    }

    /**
     * Opens {@code path} as an Avro data stream, wrapping it in a GZIP decoder when the file name
     * ends with {@code CompressorStreamFactory.GZIP}. The caller owns the returned stream and must
     * close it.
     *
     * @param path file to open
     * @return an open Avro reader over the file
     * @throws IOException if the file cannot be opened or is not a valid Avro container
     */
    private DataFileStream<GenericRecord> getAvroReader(Path path) throws IOException {
        FileSystem fileSystem = FileSystem.get(new Configuration());
        InputStream in = fileSystem.open(path);
        try {
            if (path.getName().endsWith(CompressorStreamFactory.GZIP)) {
                in = new GZIPInputStream(in);
            }
            return new DataFileStream<>(in, new GenericDatumReader<GenericRecord>());
        } catch (IOException | RuntimeException e) {
            // The wrapping constructors read the header eagerly; do not leak the file handle
            // when they reject the content.
            try {
                in.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /**
     * Checks the partition and sorting settings against {@code schema}. Partition settings are
     * checked only when a partition column is set, and the sorting column only when one is set.
     *
     * @param schema the schema of the input files
     * @throws IllegalArgumentException if a configured column is absent from the schema, the
     *         number of partitions is not positive, or the partition function is null or unknown
     */
    private void validateConfigsAgainstSchema(Schema schema) {
        if (this._partitionColumn != null) {
            Preconditions.checkArgument(schema.getField(this._partitionColumn) != null,
                    "Partition column: %s is not found from the schema of input files.", this._partitionColumn);
            Preconditions.checkArgument(this._numPartitions > 0,
                    "Number of partitions should be positive. Current value: %s", this._numPartitions);
            Preconditions.checkArgument(this._partitionFunction != null, "Partition function should not be null!");
            try {
                PartitionFunctionFactory.PartitionFunctionType.fromString(this._partitionFunction);
            } catch (IllegalArgumentException e) {
                // Report the offending function name, not the column name.
                LOGGER.error("Partition function needs to be one of Modulo, Murmur, ByteArray, HashCode, it is currently {}", this._partitionFunction);
                throw new IllegalArgumentException(e);
            }
        }
        if (this._sortingColumn != null) {
            Preconditions.checkArgument(schema.getField(this._sortingColumn) != null,
                    "Sorted column: %s is not found from the schema of input files.", this._sortingColumn);
        }
    }
}
