diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index eab4d572c3..dee7d3472b 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -29,3 +29,4 @@ updates:
     schedule:
       interval: "weekly"
       day: "sunday"
+    open-pull-requests-limit: 50
diff --git a/.github/workflows/stale-prs.yml b/.github/workflows/stale-prs.yml
new file mode 100644
index 0000000000..fe726c4812
--- /dev/null
+++ b/.github/workflows/stale-prs.yml
@@ -0,0 +1,55 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: Stale pull requests
+
+on:
+  schedule:
+    - cron: '0 0 * * *'
+  workflow_dispatch:
+
+permissions:
+  pull-requests: write
+  issues: write
+
+jobs:
+  stale:
+    runs-on: ubuntu-slim
+    steps:
+      - name: Mark and close stale pull requests
+        uses: actions/stale@v9
+        with:
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+          # Don't touch issues.
+          days-before-issue-stale: -1
+          days-before-issue-close: -1
+          # PRs stale after 2 months of inactivity and closed 1 month later.
+          days-before-pr-stale: 60
+          days-before-pr-close: 30
+          stale-pr-label: stale
+          stale-pr-message: >
+            This pull request has been automatically marked as stale because it has
+            had no activity for at least 2 months. If you are still working on this
+            change or plan to move it forward, please leave a comment or push a new
+            commit so we know to keep it open. Otherwise, this PR will be closed
+            automatically in about one month. Thank you for your contribution to
+            Apache Parquet!
+          close-pr-message: >
+            Closing this pull request due to at least 3 months of inactivity. If you
+            would like to continue the work, please feel free to reopen this pull
+            request or open a new one. Thank you for your contribution to
+            Apache Parquet!
diff --git a/README.md b/README.md
index ff6f162151..221970ee93 100644
--- a/README.md
+++ b/README.md
@@ -153,29 +153,29 @@ The build runs in [GitHub Actions](https://github.com/apache/parquet-java/action
 
 ## Add Parquet as a dependency in Maven
 
-The current release is version `1.15.1`.
+The current release is version `1.17.0`.
 
 ```xml
 <dependency>
   <groupId>org.apache.parquet</groupId>
   <artifactId>parquet-common</artifactId>
-  <version>1.15.1</version>
+  <version>1.17.0</version>
 </dependency>
 <dependency>
   <groupId>org.apache.parquet</groupId>
   <artifactId>parquet-encoding</artifactId>
-  <version>1.15.1</version>
+  <version>1.17.0</version>
 </dependency>
 <dependency>
   <groupId>org.apache.parquet</groupId>
   <artifactId>parquet-column</artifactId>
-  <version>1.15.1</version>
+  <version>1.17.0</version>
 </dependency>
 <dependency>
   <groupId>org.apache.parquet</groupId>
   <artifactId>parquet-hadoop</artifactId>
-  <version>1.15.1</version>
+  <version>1.17.0</version>
 </dependency>
 ```
diff --git a/dev/ci-before_install.sh b/dev/ci-before_install.sh
index 6b57624a23..2cfaee56a3 100755
--- a/dev/ci-before_install.sh
+++ b/dev/ci-before_install.sh
@@ -23,14 +23,24 @@
 export THRIFT_VERSION=0.22.0
 
 set -e
+set -o pipefail
 date
 sudo apt-get update -qq
 sudo apt-get install -qq --no-install-recommends build-essential pv autoconf automake libtool curl make \
-  g++ unzip libboost-dev libboost-test-dev libboost-program-options-dev \
+  g++ unzip libboost-dev libboost-test-dev libboost-program-options-dev wget \
   libevent-dev automake libtool flex bison pkg-config g++ libssl-dev xmlstarlet
 date
 pwd
-wget -qO- https://archive.apache.org/dist/thrift/$THRIFT_VERSION/thrift-$THRIFT_VERSION.tar.gz | tar zxf -
+for attempt in 1 2 3; do
+  if wget -nv -O- https://archive.apache.org/dist/thrift/$THRIFT_VERSION/thrift-$THRIFT_VERSION.tar.gz | tar zxf -; then
+    break
+  fi
+  if [[ "$attempt" -eq 3 ]]; then
+    echo "Failed to download thrift after ${attempt} attempts." >&2
+    exit 1
+  fi
+  sleep $((attempt * 5))
+done
 cd thrift-${THRIFT_VERSION}
 chmod +x ./configure
 ./configure --disable-libs
diff --git a/parquet-arrow/pom.xml b/parquet-arrow/pom.xml
index de7a3d6789..d9a9e8fb28 100644
--- a/parquet-arrow/pom.xml
+++ b/parquet-arrow/pom.xml
@@ -21,7 +21,7 @@
     <groupId>org.apache.parquet</groupId>
     <artifactId>parquet</artifactId>
     <relativePath>../pom.xml</relativePath>
-    <version>1.17.0-SNAPSHOT</version>
+    <version>1.18.0-SNAPSHOT</version>
   </parent>
 
   <modelVersion>4.0.0</modelVersion>
@@ -33,7 +33,7 @@
   <url>https://parquet.apache.org</url>
 
   <properties>
-    <arrow.version>17.0.0</arrow.version>
+    <arrow.version>19.0.0</arrow.version>
   </properties>
 
diff --git a/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaConverter.java b/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaConverter.java
index 532fa7c53a..e779b6b8c2 100644
--- a/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaConverter.java
+++ b/parquet-arrow/src/main/java/org/apache/parquet/arrow/schema/SchemaConverter.java
@@ -179,6 +179,11 @@ public TypeMapping visit(ArrowType.ListView type) {
       return createListTypeMapping();
     }
 
+    @Override
+    public TypeMapping visit(ArrowType.LargeListView type) {
+      return createListTypeMapping();
+    }
+
     private ListTypeMapping createListTypeMapping() {
       if (children.size() != 1) {
         throw new IllegalArgumentException("list fields must have exactly one child: " + field);
@@ -349,6 +354,11 @@ public TypeMapping visit(ArrowType.Duration duration) {
       return primitiveFLBA(12, LogicalTypeAnnotation.IntervalLogicalTypeAnnotation.getInstance());
     }
 
+    @Override
+    public TypeMapping visit(ArrowType.RunEndEncoded runEndEncoded) {
+      throw new UnsupportedOperationException("Unsupported type " + runEndEncoded);
+    }
+
     @Override
     public TypeMapping visit(ArrowType.ExtensionType type) {
       return ArrowTypeVisitor.super.visit(type);
@@ -769,6 +779,11 @@ public TypeMapping visit(ArrowType.ListView type) {
       return createListTypeMapping(type);
     }
 
+    @Override
+    public TypeMapping visit(ArrowType.LargeListView type) {
+      return createListTypeMapping(type);
+    }
+
     private TypeMapping createListTypeMapping(ArrowType.ComplexType type) {
       if (arrowField.getChildren().size() != 1) {
         throw new IllegalArgumentException("Invalid list type: " + type);
@@ -893,6 +908,11 @@ public TypeMapping visit(ArrowType.Duration duration) {
       return primitive();
     }
 
+    @Override
+    public TypeMapping visit(ArrowType.RunEndEncoded runEndEncoded) {
+      throw new UnsupportedOperationException("Unsupported type " + runEndEncoded);
+    }
+
     @Override
     public TypeMapping visit(ArrowType.FixedSizeBinary fixedSizeBinary) {
       return primitive();
diff --git a/parquet-avro/pom.xml b/parquet-avro/pom.xml
index 68815da263..3c84eb66d0 100644
--- a/parquet-avro/pom.xml
+++ b/parquet-avro/pom.xml
@@ -21,7 +21,7 @@
     <groupId>org.apache.parquet</groupId>
    <artifactId>parquet</artifactId>
    <relativePath>../pom.xml</relativePath>
-    <version>1.17.0-SNAPSHOT</version>
+    <version>1.18.0-SNAPSHOT</version>
  </parent>
 
  <modelVersion>4.0.0</modelVersion>
@@ -112,25 +112,7 @@
    <dependency>
      <groupId>org.mockito</groupId>
      <artifactId>mockito-core</artifactId>
-      <version>2.23.0</version>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.powermock</groupId>
-      <artifactId>powermock-module-junit4</artifactId>
-      <version>${powermock.version}</version>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.powermock</groupId>
-      <artifactId>powermock-core</artifactId>
-      <version>${powermock.version}</version>
-      <scope>test</scope>
-    </dependency>
-    <dependency>
-      <groupId>org.powermock</groupId>
-      <artifactId>powermock-api-mockito2</artifactId>
-      <version>${powermock.version}</version>
+      <version>${mockito.version}</version>
      <scope>test</scope>
    </dependency>
diff --git a/parquet-avro/src/main/java/org/apache/parquet/avro/package-info.java b/parquet-avro/src/main/java/org/apache/parquet/avro/package-info.java
index fe412e001b..7daff59277 100644
--- a/parquet-avro/src/main/java/org/apache/parquet/avro/package-info.java
+++ b/parquet-avro/src/main/java/org/apache/parquet/avro/package-info.java
@@ -77,7 +77,7 @@
 *
 * map
 * group (with original type MAP) containing one repeated group
- * field (with original type MAP_KEY_VALUE) of (key, value)
+ * field of (key, value)
 *
 *
 * fixed
@@ -148,7 +148,7 @@
 *
 *
 * group (with original type MAP) containing one repeated group
- * field (with original type MAP_KEY_VALUE) of (key, value)
+ * field of (key, value)
 * map
 *
 *
diff --git a/parquet-avro/src/test/java/org/apache/parquet/avro/TestAvroRecordConverter.java b/parquet-avro/src/test/java/org/apache/parquet/avro/TestAvroRecordConverter.java
index 315320bbdc..7c1db48f46 100644
--- a/parquet-avro/src/test/java/org/apache/parquet/avro/TestAvroRecordConverter.java
+++ b/parquet-avro/src/test/java/org/apache/parquet/avro/TestAvroRecordConverter.java
@@ -34,22 +34,25 @@
 import org.apache.avro.Schema;
 import org.apache.avro.SchemaBuilder;
 import org.apache.avro.specific.SpecificData;
+import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
-import org.junit.runner.RunWith;
+import org.mockito.MockedStatic;
 import org.mockito.Mockito;
-import org.powermock.api.mockito.PowerMockito;
-import org.powermock.core.classloader.annotations.PrepareForTest;
-import org.powermock.modules.junit4.PowerMockRunner;
 
-@RunWith(PowerMockRunner.class)
-@PrepareForTest(AvroRecordConverter.class)
 public class TestAvroRecordConverter {
+  private MockedStatic<AvroRecordConverter> avroRecordConverterMock;
+
   @Before
   public void setup() {
     // Default to calling real methods unless overridden in specific test
-    PowerMockito.mockStatic(AvroRecordConverter.class, CALLS_REAL_METHODS);
+    avroRecordConverterMock = Mockito.mockStatic(AvroRecordConverter.class, CALLS_REAL_METHODS);
+  }
+
+  @After
+  public void tearDown() {
+    avroRecordConverterMock.close();
   }
 
   @Test
@@ -86,7 +89,7 @@ public void testModelForGenericRecord() {
   // Test logical type support for older Avro versions
   @Test
   public void testModelForSpecificRecordWithLogicalTypesWithDeprecatedAvro1_8() {
-    Mockito.when(AvroRecordConverter.getRuntimeAvroVersion()).thenReturn("1.8.2");
+    avroRecordConverterMock.when(AvroRecordConverter::getRuntimeAvroVersion).thenReturn("1.8.2");
 
     // Test that model is generated correctly when record contains both top-level and nested logical types
     SpecificData model = AvroRecordConverter.getModelForSchema(LogicalTypesTestDeprecated.SCHEMA$);
@@ -108,7 +111,7 @@ public void testModelForSpecificRecordWithLogicalTypesWithDeprecatedAvro1_8() {
 
   @Test
   public void testModelForSpecificRecordWithLogicalTypesWithDeprecatedAvro1_7() {
-    Mockito.when(AvroRecordConverter.getRuntimeAvroVersion()).thenReturn("1.7.7");
+    avroRecordConverterMock.when(AvroRecordConverter::getRuntimeAvroVersion).thenReturn("1.7.7");
 
     // Test that model is generated correctly
     final SpecificData model = AvroRecordConverter.getModelForSchema(LogicalTypesTestDeprecated.SCHEMA$);
diff --git a/parquet-avro/src/test/java/org/apache/parquet/avro/TestAvroSchemaConverter.java b/parquet-avro/src/test/java/org/apache/parquet/avro/TestAvroSchemaConverter.java
index 346fafe7d3..412e8f2957 100644
--- a/parquet-avro/src/test/java/org/apache/parquet/avro/TestAvroSchemaConverter.java
+++ b/parquet-avro/src/test/java/org/apache/parquet/avro/TestAvroSchemaConverter.java
@@ -62,25 +62,27 @@
 import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
 import org.apache.parquet.schema.Type;
 import org.apache.parquet.schema.Types;
+import org.junit.After;
 import org.junit.Assert;
 import org.junit.Before;
 import org.junit.BeforeClass;
 import org.junit.Test;
-import org.junit.runner.RunWith;
+import org.mockito.MockedStatic;
 import org.mockito.Mockito;
-import org.powermock.api.mockito.PowerMockito;
-import org.powermock.core.classloader.annotations.PrepareForTest;
-import org.powermock.modules.junit4.PowerMockRunner;
 
-@RunWith(PowerMockRunner.class)
-@PrepareForTest(AvroRecordConverter.class)
 public class TestAvroSchemaConverter {
 
   private static final Configuration NEW_BEHAVIOR = new Configuration(false);
+  private MockedStatic<AvroRecordConverter> avroRecordConverterMock;
 
   @Before
   public void setupMockito() {
-    PowerMockito.mockStatic(AvroRecordConverter.class, CALLS_REAL_METHODS);
+    avroRecordConverterMock = Mockito.mockStatic(AvroRecordConverter.class, CALLS_REAL_METHODS);
+  }
+
+  @After
+  public void tearDown() {
+    avroRecordConverterMock.close();
   }
 
   @BeforeClass
@@ -119,7 +121,7 @@ public static void setupConf() {
         + "    }\n"
         + "  }\n"
         + "  required group mymap (MAP) {\n"
-        + "    repeated group map (MAP_KEY_VALUE) {\n"
+        + "    repeated group map {\n"
         + "      required binary key (UTF8);\n"
         + "      required int32 value;\n"
         + "    }\n"
@@ -212,13 +214,13 @@ public void testAllTypes() throws Exception {
         + "    }\n"
         + "  }\n"
         + "  required group mymap (MAP) {\n"
-        + "    repeated group key_value (MAP_KEY_VALUE) {\n"
+        + "    repeated group key_value {\n"
         + "      required binary key (UTF8);\n"
         + "      required int32 value;\n"
         + "    }\n"
         + "  }\n"
         + "  required group myemptymap (MAP) {\n"
-        + "    repeated group key_value (MAP_KEY_VALUE) {\n"
+        + "    repeated group key_value {\n"
         + "      required binary key (UTF8);\n"
         + "      required int32 value;\n"
         + "    }\n"
@@ -259,13 +261,13 @@ public void testAllTypesOldListBehavior() throws Exception {
         + "    repeated int32 array;\n"
         + "  }\n"
         + "  required group mymap (MAP) {\n"
-        + "    repeated group key_value (MAP_KEY_VALUE) {\n"
+        + "    repeated group key_value {\n"
         + "      required binary key (UTF8);\n"
         + "      required int32 value;\n"
         + "    }\n"
         + "  }\n"
         + "  required group myemptymap (MAP) {\n"
-        + "    repeated group key_value (MAP_KEY_VALUE) {\n"
+        + "    repeated group key_value {\n"
         + "      required binary key (UTF8);\n"
         + "      required int32 value;\n"
         + "    }\n"
@@ -320,7 +322,7 @@ public void testOptionalMapValue() throws Exception {
     testRoundTripConversion(
         schema,
         "message record1 {\n" + "  required group myintmap (MAP) {\n"
-            + "    repeated group key_value (MAP_KEY_VALUE) {\n"
+            + "    repeated group key_value {\n"
            + "      required binary key (UTF8);\n"
            + "      optional int32 value;\n"
            + "    }\n"
@@ -706,7 +708,9 @@ public void testTimestampMillisType() throws Exception {
 
     // Test that conversions for timestamp types only use APIs that are available in the user's Avro version
     for (String avroVersion : ImmutableSet.of("1.7.0", "1.8.0", "1.9.0", "1.10.0", "1.11.0")) {
-      Mockito.when(AvroRecordConverter.getRuntimeAvroVersion()).thenReturn(avroVersion);
+      avroRecordConverterMock
+          .when(AvroRecordConverter::getRuntimeAvroVersion)
+          .thenReturn(avroVersion);
       final Schema converted = new AvroSchemaConverter()
           .convert(Types.buildMessage()
               .addField(Types.primitive(INT64, Type.Repetition.REQUIRED)
@@ -792,7 +796,9 @@ public void testTimestampMicrosType() throws Exception {
 
     // Test that conversions for timestamp types only use APIs that are available in the user's Avro version
     for (String avroVersion : ImmutableSet.of("1.7.0", "1.8.0", "1.9.0", "1.10.0", "1.11.0")) {
-      Mockito.when(AvroRecordConverter.getRuntimeAvroVersion()).thenReturn(avroVersion);
+      avroRecordConverterMock
+          .when(AvroRecordConverter::getRuntimeAvroVersion)
+          .thenReturn(avroVersion);
       final Schema converted = new AvroSchemaConverter()
           .convert(Types.buildMessage()
               .addField(Types.primitive(INT64, Type.Repetition.REQUIRED)
@@ -971,7 +977,7 @@ public void testAvroFixed12AsParquetInt96Type() throws Exception {
         + "    repeated int96 array;\n"
         + "  }\n"
         + "  required group mymap (MAP) {\n"
-        + "    repeated group key_value (MAP_KEY_VALUE) {\n"
+        + "    repeated group key_value {\n"
         + "      required binary key (STRING);\n"
         + "      required int96 value;\n"
         + "    }\n"
diff --git a/parquet-benchmarks/pom.xml b/parquet-benchmarks/pom.xml
index b1003c2808..65d6dbf3ed 100644
--- a/parquet-benchmarks/pom.xml
+++ b/parquet-benchmarks/pom.xml
@@ -21,7 +21,7 @@
     <groupId>org.apache.parquet</groupId>
    <artifactId>parquet</artifactId>
    <relativePath>../pom.xml</relativePath>
-    <version>1.17.0-SNAPSHOT</version>
+    <version>1.18.0-SNAPSHOT</version>
  </parent>
 
  <modelVersion>4.0.0</modelVersion>
diff --git a/parquet-cli/pom.xml b/parquet-cli/pom.xml
index 86f895d370..3585287638 100644
--- a/parquet-cli/pom.xml
+++ b/parquet-cli/pom.xml
@@ -21,7 +21,7 @@
     <groupId>org.apache.parquet</groupId>
    <artifactId>parquet</artifactId>
    <relativePath>../pom.xml</relativePath>
-    <version>1.17.0-SNAPSHOT</version>
+    <version>1.18.0-SNAPSHOT</version>
  </parent>
 
  <modelVersion>4.0.0</modelVersion>
@@ -97,7 +97,7 @@
    <dependency>
      <groupId>com.google.protobuf</groupId>
      <artifactId>protobuf-java</artifactId>
-      <version>4.33.2</version>
+      <version>4.34.1</version>
      <scope>test</scope>
    </dependency>
@@ -178,7 +178,7 @@
      <groupId>${jackson.groupId}</groupId>
      <artifactId>jackson-annotations</artifactId>
-      <version>${jackson.version}</version>
+      <version>${jackson-annotations.version}</version>
    </dependency>
    <dependency>
      <groupId>${jackson.datatype.groupId}</groupId>
@@ -239,7 +239,7 @@
    <dependency>
      <groupId>commons-logging</groupId>
      <artifactId>commons-logging</artifactId>
-      <version>1.3.5</version>
+      <version>1.3.6</version>
      <scope>${deps.scope}</scope>
    </dependency>
diff --git a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ColumnSizeCommand.java b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ColumnSizeCommand.java
index 6de1c7badc..6694fb8ee5 100644
--- a/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ColumnSizeCommand.java
+++ b/parquet-cli/src/main/java/org/apache/parquet/cli/commands/ColumnSizeCommand.java
@@ -24,8 +24,11 @@
 import com.google.common.base.Preconditions;
 import com.google.common.collect.Lists;
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.LinkedHashMap;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
@@ -56,6 +59,18 @@ public ColumnSizeCommand(Logger console) {
       required = false)
   List<String> columns;
 
+  @Parameter(
+      names = {"-s", "--sort"},
+      description = "Sort columns by size in descending order",
+      required = false)
+  boolean sortBySize = false;
+
+  @Parameter(
+      names = {"-p", "--percentage"},
+      description = "Print ratio as percentage instead of decimal",
+      required = false)
+  boolean printAsPercentage = false;
+
   @Override
   @SuppressWarnings("unchecked")
   public int run() throws IOException {
@@ -67,6 +82,10 @@ public int run() throws IOException {
 
     // If user defined columns, only print out size for those columns
     if (columns != null && !columns.isEmpty()) {
+      // Collect aggregated column data
+      Map<String, Long> aggregatedSizes = new LinkedHashMap<>();
+      Map<String, Float> aggregatedRatios = new LinkedHashMap<>();
+
       for (String inputColumn : columns) {
         long size = 0;
         float ratio = 0;
@@ -76,18 +95,52 @@ public int run() throws IOException {
           ratio += columnRatio.get(column);
         }
       }
-      console.info(inputColumn + "->" + " Size In Bytes: " + size + " Size In Ratio: " + ratio);
+        aggregatedSizes.put(inputColumn, size);
+        aggregatedRatios.put(inputColumn, ratio);
+      }
+
+      // Sort if requested
+      List<Map.Entry<String, Long>> entries = new ArrayList<>(aggregatedSizes.entrySet());
+      if (sortBySize) {
+        entries.sort(Map.Entry.comparingByValue().reversed());
+      }
+
+      // Print results
+      for (Map.Entry<String, Long> entry : entries) {
+        String column = entry.getKey();
+        long size = entry.getValue();
+        float ratio = aggregatedRatios.get(column);
+        String ratioStr = formatRatio(ratio);
+        console.info(column + "->" + " Size In Bytes: " + size + " Size In Ratio: " + ratioStr);
       }
     } else {
-      for (String column : columnSizes.keySet()) {
-        console.info(column + "->" + " Size In Bytes: " + columnSizes.get(column) + " Size In Ratio: "
-            + columnRatio.get(column));
+      // Sort if requested
+      List<Map.Entry<String, Long>> entries = new ArrayList<>(columnSizes.entrySet());
+      if (sortBySize) {
+        entries.sort(Map.Entry.comparingByValue().reversed());
+      }
+
+      // Print results
+      for (Map.Entry<String, Long> entry : entries) {
+        String column = entry.getKey();
+        long size = entry.getValue();
+        float ratio = columnRatio.get(column);
+        String ratioStr = formatRatio(ratio);
+        console.info(column + "->" + " Size In Bytes: " + size + " Size In Ratio: " + ratioStr);
       }
     }
     return 0;
   }
 
+  private String formatRatio(float ratio) {
+    if (printAsPercentage) {
+      return String.format(Locale.US, "%.4f%%", ratio * 100);
+    } else {
+      return String.valueOf(ratio);
+    }
+  }
+
   @Override
   public List<String> getExamples() {
     return Lists.newArrayList(
@@ -96,7 +149,16 @@ public List getExamples() {
         "sample.parquet -c col_1",
         "sample.parquet --column col_2",
         "sample.parquet --columns col_1 col_2",
-        "sample.parquet --columns col_1 col_2.sub_col_a");
+        "sample.parquet --columns col_1 col_2.sub_col_a",
+        "# Sort columns by size in descending order",
+        "sample.parquet --sort",
+        "sample.parquet -s",
+        "# Print ratio as percentage",
+        "sample.parquet --percentage",
+        "sample.parquet -p",
+        "# Combine sorting and percentage formatting",
+        "sample.parquet --sort --percentage",
+        "sample.parquet -s -p -c col_1 col_2");
   }
 
   // Make it public to allow some automation tools to call it
diff --git a/parquet-column/pom.xml b/parquet-column/pom.xml
index 8bd11dcc26..f9d976980e 100644
--- a/parquet-column/pom.xml
+++ b/parquet-column/pom.xml
@@ -21,7 +21,7 @@
     <groupId>org.apache.parquet</groupId>
    <artifactId>parquet</artifactId>
    <relativePath>../pom.xml</relativePath>
-    <version>1.17.0-SNAPSHOT</version>
+    <version>1.18.0-SNAPSHOT</version>
  </parent>
 
  <modelVersion>4.0.0</modelVersion>
@@ -90,7 +90,7 @@
    <dependency>
      <groupId>org.mockito</groupId>
-      <artifactId>mockito-all</artifactId>
+      <artifactId>mockito-core</artifactId>
      <version>${mockito.version}</version>
      <scope>test</scope>
    </dependency>
@@ -100,6 +100,12 @@
      <version>${commons-lang3.version}</version>
      <scope>test</scope>
    </dependency>
+    <dependency>
+      <groupId>org.hamcrest</groupId>
+      <artifactId>hamcrest-core</artifactId>
+      <version>1.3</version>
+      <scope>test</scope>
+    </dependency>
 
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java
index 87d39bf16e..9488a38494 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/BinaryStatistics.java
@@ -105,7 +105,7 @@ String stringify(Binary value) {
 
   @Override
   public boolean isSmallerThan(long size) {
-    return !hasNonNullValue() || ((min.length() + max.length()) < size);
+    return !hasNonNullValue() || (((long) min.length() + max.length()) < size);
   }
 
   public boolean isSmallerThanWithTruncation(long size, int truncationLength) {
@@ -113,8 +113,8 @@ public boolean isSmallerThanWithTruncation(long size, int truncationLength) {
       return true;
     }
 
-    int minTruncateLength = Math.min(min.length(), truncationLength);
-    int maxTruncateLength = Math.min(max.length(), truncationLength);
+    long minTruncateLength = Math.min(min.length(), truncationLength);
+    long maxTruncateLength = Math.min(max.length(), truncationLength);
 
     return minTruncateLength + maxTruncateLength < size;
   }
diff --git a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
index 206ddadadc..bee9877738 100644
--- a/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
+++ b/parquet-column/src/main/java/org/apache/parquet/column/statistics/Statistics.java
@@ -142,10 +142,6 @@ public Statistics build() {
 
   // Builder for FLOAT16 type to handle special cases of min/max values like NaN, -0.0, and 0.0
   private static class Float16Builder extends Builder {
-    private static final Binary POSITIVE_ZERO_LITTLE_ENDIAN = Binary.fromConstantByteArray(new byte[] {0x00, 0x00});
-    private static final Binary NEGATIVE_ZERO_LITTLE_ENDIAN =
-        Binary.fromConstantByteArray(new byte[] {0x00, (byte) 0x80});
-
     public Float16Builder(PrimitiveType type) {
       super(type);
       assert type.getPrimitiveTypeName() == PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
@@ -162,15 +158,17 @@ public Statistics build() {
       short max = bMax.get2BytesLittleEndian();
       // Drop min/max values in case of NaN as the sorting order of values is undefined for this case
       if (Float16.isNaN(min) || Float16.isNaN(max)) {
-        stats.setMinMax(POSITIVE_ZERO_LITTLE_ENDIAN, NEGATIVE_ZERO_LITTLE_ENDIAN);
+        stats.setMinMax(Float16.POSITIVE_ZERO_LITTLE_ENDIAN, Float16.POSITIVE_ZERO_LITTLE_ENDIAN);
         ((Statistics) stats).hasNonNullValue = false;
       } else {
         // Updating min to -0.0 and max to +0.0 to ensure that no 0.0 values would be skipped
         if (min == (short) 0x0000) {
-          stats.setMinMax(NEGATIVE_ZERO_LITTLE_ENDIAN, bMax);
+          bMin = Float16.NEGATIVE_ZERO_LITTLE_ENDIAN;
+          stats.setMinMax(bMin, bMax);
         }
         if (max == (short) 0x8000) {
-          stats.setMinMax(bMin, POSITIVE_ZERO_LITTLE_ENDIAN);
+          bMax = Float16.POSITIVE_ZERO_LITTLE_ENDIAN;
+          stats.setMinMax(bMin, bMax);
         }
       }
     }
diff --git a/parquet-column/src/main/java/org/apache/parquet/example/data/simple/SimpleGroup.java b/parquet-column/src/main/java/org/apache/parquet/example/data/simple/SimpleGroup.java
index 7632081390..5f4873220e 100644
--- a/parquet-column/src/main/java/org/apache/parquet/example/data/simple/SimpleGroup.java
+++ b/parquet-column/src/main/java/org/apache/parquet/example/data/simple/SimpleGroup.java
@@ -230,7 +230,7 @@ public void add(int fieldIndex, Binary value) {
         break;
       default:
         throw new UnsupportedOperationException(
-            getType().asPrimitiveType().getName() + " not supported for Binary");
+            getType().getType(fieldIndex).asPrimitiveType().getName() + " not supported for Binary");
     }
   }
 
diff --git a/parquet-column/src/main/java/org/apache/parquet/filter2/recordlevel/IncrementallyUpdatedFilterPredicate.java b/parquet-column/src/main/java/org/apache/parquet/filter2/recordlevel/IncrementallyUpdatedFilterPredicate.java
index c2aab2b6bf..7aea56b2a0 100644
--- a/parquet-column/src/main/java/org/apache/parquet/filter2/recordlevel/IncrementallyUpdatedFilterPredicate.java
+++ b/parquet-column/src/main/java/org/apache/parquet/filter2/recordlevel/IncrementallyUpdatedFilterPredicate.java
@@ -18,7 +18,7 @@
  */
 package org.apache.parquet.filter2.recordlevel;
 
-import java.util.Arrays;
+import java.util.List;
 import java.util.Objects;
 import org.apache.parquet.io.api.Binary;
 
@@ -153,7 +153,7 @@ abstract static class DelegatingValueInspector extends ValueInspector {
     private final Iterable<ValueInspector> delegates;
 
     DelegatingValueInspector(ValueInspector... delegates) {
-      this.delegates = Arrays.asList(delegates);
+      this.delegates = List.of(delegates);
     }
 
     /**
diff --git a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryColumnIndexBuilder.java b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryColumnIndexBuilder.java
index 1c546b5160..24de97d01e 100644
--- a/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryColumnIndexBuilder.java
+++ b/parquet-column/src/main/java/org/apache/parquet/internal/column/columnindex/BinaryColumnIndexBuilder.java
@@ -23,6 +23,8 @@
 import java.util.List;
 import org.apache.parquet.filter2.predicate.Statistics;
 import org.apache.parquet.io.api.Binary;
+import org.apache.parquet.schema.Float16;
+import org.apache.parquet.schema.LogicalTypeAnnotation;
 import org.apache.parquet.schema.PrimitiveComparator;
 import org.apache.parquet.schema.PrimitiveType;
 
@@ -82,6 +84,8 @@ int compareValueToMax(int arrayIndex) {
   private final List<Binary> minValues = new ArrayList<>();
   private final List<Binary> maxValues = new ArrayList<>();
   private final BinaryTruncator truncator;
   private final int truncateLength;
+  private final boolean isFloat16;
+  private boolean invalid;
 
   private static Binary convert(ByteBuffer buffer) {
     return Binary.fromReusedByteBuffer(buffer);
@@ -94,6 +98,7 @@ private static ByteBuffer convert(Binary value) {
   BinaryColumnIndexBuilder(PrimitiveType type, int truncateLength) {
     truncator = BinaryTruncator.getTruncator(type);
     this.truncateLength = truncateLength;
+    this.isFloat16 = type.getLogicalTypeAnnotation() instanceof LogicalTypeAnnotation.Float16LogicalTypeAnnotation;
   }
 
   @Override
@@ -104,12 +109,43 @@ void addMinMaxFromBytes(ByteBuffer min, ByteBuffer max) {
 
   @Override
   void addMinMax(Object min, Object max) {
-    minValues.add(min == null ? null : truncator.truncateMin((Binary) min, truncateLength));
-    maxValues.add(max == null ? null : truncator.truncateMax((Binary) max, truncateLength));
+    Binary bMin = (Binary) min;
+    Binary bMax = (Binary) max;
+
+    if (isFloat16 && bMin != null && bMax != null) {
+      if (bMin.length() != LogicalTypeAnnotation.Float16LogicalTypeAnnotation.BYTES
+          || bMax.length() != LogicalTypeAnnotation.Float16LogicalTypeAnnotation.BYTES) {
+        // Should not happen for Float16
+        invalid = true;
+      } else {
+        short sMin = bMin.get2BytesLittleEndian();
+        short sMax = bMax.get2BytesLittleEndian();
+
+        if (Float16.isNaN(sMin) || Float16.isNaN(sMax)) {
+          invalid = true;
+        }
+
+        // Sorting order is undefined for -0.0 so let min = -0.0 and max = +0.0 to
+        // ensure that no 0.0 values are skipped
+        // +0.0 is 0x0000, -0.0 is 0x8000 (little endian: 00 00, 00 80)
+        if (sMin == (short) 0x0000) {
+          bMin = Float16.NEGATIVE_ZERO_LITTLE_ENDIAN;
+        }
+        if (sMax == (short) 0x8000) {
+          bMax = Float16.POSITIVE_ZERO_LITTLE_ENDIAN;
+        }
+      }
+    }
+
+    minValues.add(bMin == null ? null : truncator.truncateMin(bMin, truncateLength));
+    maxValues.add(bMax == null ? null : truncator.truncateMax(bMax, truncateLength));
   }
 
   @Override
   ColumnIndexBase createColumnIndex(PrimitiveType type) {
+    if (invalid) {
+      return null;
+    }
     BinaryColumnIndex columnIndex = new BinaryColumnIndex(type);
     columnIndex.minValues = minValues.toArray(new Binary[0]);
     columnIndex.maxValues = maxValues.toArray(new Binary[0]);
diff --git a/parquet-column/src/main/java/org/apache/parquet/io/RecordReaderImplementation.java b/parquet-column/src/main/java/org/apache/parquet/io/RecordReaderImplementation.java
index ac2f74be6e..46c4b714a2 100644
--- a/parquet-column/src/main/java/org/apache/parquet/io/RecordReaderImplementation.java
+++ b/parquet-column/src/main/java/org/apache/parquet/io/RecordReaderImplementation.java
@@ -488,6 +488,6 @@ protected Converter getRecordConsumer() {
 
   protected Iterable getColumnReaders() {
     // Converting the array to an iterable ensures that the array cannot be altered
-    return Arrays.asList(columnReaders);
+    return List.of(columnReaders);
   }
 }
diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/ConversionPatterns.java b/parquet-column/src/main/java/org/apache/parquet/schema/ConversionPatterns.java
index e4ede9f15a..2b812ff601 100644
--- a/parquet-column/src/main/java/org/apache/parquet/schema/ConversionPatterns.java
+++ b/parquet-column/src/main/java/org/apache/parquet/schema/ConversionPatterns.java
@@ -75,11 +75,7 @@ public static GroupType mapType(
           repetition,
           alias,
           LogicalTypeAnnotation.mapType(),
-          new GroupType(
-              Repetition.REPEATED,
-              mapAlias,
-              LogicalTypeAnnotation.MapKeyValueTypeAnnotation.getInstance(),
-              keyType));
+          new GroupType(Repetition.REPEATED, mapAlias, keyType));
     } else {
       if (!valueType.getName().equals("value")) {
         throw new RuntimeException(valueType.getName() + " should be value");
@@ -88,12 +84,7 @@ public static GroupType mapType(
           repetition,
           alias,
           LogicalTypeAnnotation.mapType(),
-          new GroupType(
-              Repetition.REPEATED,
-              mapAlias,
-              LogicalTypeAnnotation.MapKeyValueTypeAnnotation.getInstance(),
-              keyType,
-              valueType));
+          new GroupType(Repetition.REPEATED, mapAlias, keyType, valueType));
     }
   }
 
diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/Float16.java b/parquet-column/src/main/java/org/apache/parquet/schema/Float16.java
index 6fe0e3d4c3..fc146a8890 100644
--- a/parquet-column/src/main/java/org/apache/parquet/schema/Float16.java
+++ b/parquet-column/src/main/java/org/apache/parquet/schema/Float16.java
@@ -46,6 +46,13 @@
  * Ref: https://android.googlesource.com/platform/libcore/+/master/luni/src/main/java/libcore/util/FP16.java
  */
 public class Float16 {
+  // Positive zero of type half-precision float.
+  public static final Binary POSITIVE_ZERO_LITTLE_ENDIAN =
+      Binary.fromConstantByteArray(new byte[] {0x00, 0x00}, 0, 2);
+  // Negative zero of type half-precision float.
+  public static final Binary NEGATIVE_ZERO_LITTLE_ENDIAN =
+      Binary.fromConstantByteArray(new byte[] {0x00, (byte) 0x80}, 0, 2);
+
   // Positive infinity of type half-precision float.
   private static final short POSITIVE_INFINITY = (short) 0x7c00;
   // A Not-a-Number representation of a half-precision float.
diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java b/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java
index be98e071f6..98bc5c0237 100644
--- a/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java
+++ b/parquet-column/src/main/java/org/apache/parquet/schema/LogicalTypeAnnotation.java
@@ -18,7 +18,6 @@
  */
 package org.apache.parquet.schema;
 
-import static java.util.Arrays.asList;
 import static java.util.Optional.empty;
 import static org.apache.parquet.schema.ColumnOrder.ColumnOrderName.TYPE_DEFINED_ORDER;
 import static org.apache.parquet.schema.ColumnOrder.ColumnOrderName.UNDEFINED;
@@ -33,8 +32,6 @@
 import static org.apache.parquet.schema.PrimitiveStringifier.TIME_STRINGIFIER;
 import static org.apache.parquet.schema.PrimitiveStringifier.TIME_UTC_STRINGIFIER;
 
-import java.util.Collections;
-import java.util.HashSet;
 import java.util.List;
 import java.util.Objects;
 import java.util.Optional;
@@ -822,8 +819,7 @@ PrimitiveStringifier valueStringifier(PrimitiveType primitiveType) {
   }
 
   public static class IntLogicalTypeAnnotation extends LogicalTypeAnnotation {
-    private static final Set<Integer> VALID_BIT_WIDTH =
-        Collections.unmodifiableSet(new HashSet<>(asList(8, 16, 32, 64)));
+    private static final Set<Integer> VALID_BIT_WIDTH = Set.of(8, 16, 32, 64);
 
     private final int bitWidth;
     private final boolean isSigned;
diff --git a/parquet-column/src/main/java/org/apache/parquet/schema/Types.java b/parquet-column/src/main/java/org/apache/parquet/schema/Types.java
index 7298a356b4..2f12991ab0 100644
--- a/parquet-column/src/main/java/org/apache/parquet/schema/Types.java
+++ b/parquet-column/src/main/java/org/apache/parquet/schema/Types.java
@@ -110,7 +110,7 @@
  *
  *
  *   // required group zipMap (MAP) {
- *   //   repeated group map (MAP_KEY_VALUE) {
+ *   //   repeated group map {
  *   //     required float key
  *   //     optional int32 value
  *   //   }
@@ -122,7 +122,7 @@
  *
  *
  *   // required group zipMap (MAP) {
- *   //   repeated group map (MAP_KEY_VALUE) {
+ *   //   repeated group map {
  *   //     required group key {
  *   //       optional int64 first;
  *   //       required group second {
diff --git a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java
index 813298c2b6..786d2be2c3 100644
--- a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java
+++ b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestSizeStatistics.java
@@ -18,8 +18,8 @@
  */
 package org.apache.parquet.column.statistics;
 
-import java.util.Arrays;
 import java.util.Collections;
+import java.util.List;
 import java.util.Optional;
 import org.apache.parquet.io.api.Binary;
 import org.apache.parquet.schema.LogicalTypeAnnotation;
@@ -47,8 +47,8 @@ public void testAddBinaryType() {
     builder.add(1, 1);
     SizeStatistics statistics = builder.build();
     Assert.assertEquals(Optional.of(3L), statistics.getUnencodedByteArrayDataBytes());
-    Assert.assertEquals(Arrays.asList(3L, 3L, 1L), statistics.getRepetitionLevelHistogram());
-    Assert.assertEquals(Arrays.asList(2L, 2L, 3L), statistics.getDefinitionLevelHistogram());
+    Assert.assertEquals(List.of(3L, 3L, 1L), statistics.getRepetitionLevelHistogram());
+    Assert.assertEquals(List.of(2L, 2L, 3L), statistics.getDefinitionLevelHistogram());
   }
 
   @Test
@@ -67,7 +67,7 @@ public void testAddNonBinaryType() {
     builder.add(1, 0);
     SizeStatistics statistics = builder.build();
     Assert.assertEquals(Optional.empty(), statistics.getUnencodedByteArrayDataBytes());
-    Assert.assertEquals(Arrays.asList(2L, 4L), statistics.getRepetitionLevelHistogram());
+    Assert.assertEquals(List.of(2L, 4L), statistics.getRepetitionLevelHistogram());
     Assert.assertEquals(Collections.emptyList(), statistics.getDefinitionLevelHistogram());
   }
 
@@ -89,8 +89,8 @@ public void testMergeStatistics() {
     SizeStatistics statistics2 = builder2.build();
     statistics1.mergeStatistics(statistics2);
     Assert.assertEquals(Optional.of(5L), statistics1.getUnencodedByteArrayDataBytes());
-    Assert.assertEquals(Arrays.asList(3L, 1L, 1L), statistics1.getRepetitionLevelHistogram());
-    Assert.assertEquals(Arrays.asList(1L, 3L, 1L), statistics1.getDefinitionLevelHistogram());
+    Assert.assertEquals(List.of(3L, 1L, 1L), statistics1.getRepetitionLevelHistogram());
+    Assert.assertEquals(List.of(1L, 3L, 1L), statistics1.getDefinitionLevelHistogram());
   }
 
   @Test
@@ -122,8 +122,8 @@ public void testCopyStatistics() {
     SizeStatistics statistics = builder.build();
     SizeStatistics copy = statistics.copy();
     Assert.assertEquals(Optional.of(3L), copy.getUnencodedByteArrayDataBytes());
-    Assert.assertEquals(Arrays.asList(1L, 1L, 1L), copy.getRepetitionLevelHistogram());
-    Assert.assertEquals(Arrays.asList(1L, 1L, 1L), copy.getDefinitionLevelHistogram());
+    Assert.assertEquals(List.of(1L, 1L, 1L), copy.getRepetitionLevelHistogram());
+    Assert.assertEquals(List.of(1L, 1L, 1L), copy.getDefinitionLevelHistogram());
   }
 
   @Test
diff --git a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java
index dec244f629..92eaa7a302 100644
--- a/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java
+++ b/parquet-column/src/test/java/org/apache/parquet/column/statistics/TestStatistics.java
@@ -927,4 +927,15 @@ public void testNoopStatistics() {
     assertThrows(UnsupportedOperationException.class, stats::minAsString);
     assertThrows(UnsupportedOperationException.class, () -> stats.isSmallerThan(0));
   }
+
+  @Test
+  public void testBinaryIsSmallerThanNoOverflowForLargeValues() {
+    BinaryStatistics stats = new BinaryStatistics();
+    // Create a Binary whose length() reports 2^30 without allocating 1 GB
+    Binary fakeLarge = Binary.fromConstantByteArray(new byte[0], 0, 1 << 30);
+    stats.updateStats(fakeLarge);
+
+    // min.length() + max.length() = 2^31, must not overflow int to negative
+    assertFalse(stats.isSmallerThan(4096));
+  }
 }
diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderTest.java b/parquet-column/src/test/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderTest.java
index 348d245595..dc37ed67ca 100644
--- a/parquet-column/src/test/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderTest.java
+++ b/parquet-column/src/test/java/org/apache/parquet/column/values/bytestreamsplit/ByteStreamSplitValuesReaderTest.java
@@ -37,7 +37,7 @@ private static  Reader makeReader(byte[] input, int
       throws Exception {
     ByteBuffer buffer = ByteBuffer.wrap(input);
     ByteBufferInputStream stream = ByteBufferInputStream.wrap(buffer);
-    Reader reader = cls.newInstance();
+    Reader reader = cls.getDeclaredConstructor().newInstance();
     reader.initFromPage(length, stream);
     return reader;
   }
diff --git a/parquet-column/src/test/java/org/apache/parquet/column/values/rle/TestRunLengthBitPackingHybridEncoder.java b/parquet-column/src/test/java/org/apache/parquet/column/values/rle/TestRunLengthBitPackingHybridEncoder.java
index 7ec5a9568c..93a6c8deb4 100644
--- a/parquet-column/src/test/java/org/apache/parquet/column/values/rle/TestRunLengthBitPackingHybridEncoder.java
+++ b/parquet-column/src/test/java/org/apache/parquet/column/values/rle/TestRunLengthBitPackingHybridEncoder.java
@@ -22,7 +22,6 @@
 
 import java.io.ByteArrayInputStream;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.List;
 import org.apache.parquet.bytes.BytesUtils;
 import org.apache.parquet.bytes.DirectByteBufferAllocator;
@@ -187,7 +186,7 @@ public void testTransitionFromBitPackingToRle() throws Exception {
     assertEquals(3, BytesUtils.readUnsignedVarInt(is));
 
     List<Integer> values = unpack(3, 8, is);
-    assertEquals(Arrays.asList(0, 1, 0, 1, 0, 2, 2, 2), values);
+    assertEquals(List.of(0, 1, 0, 1, 0, 2, 2, 2), values);
 
     // header = 100 << 1 = 200
     assertEquals(200, BytesUtils.readUnsignedVarInt(is));
@@ -212,7 +211,7 @@ public void testPaddingZerosOnUnfinishedBitPackedRuns() throws Exception {
 
     List<Integer> values = unpack(5, 16, is);
 
-    assertEquals(Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0, 0), values);
+    assertEquals(List.of(1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0, 0), values);
 
     assertEquals(-1, is.read());
   }
diff --git a/parquet-column/src/test/java/org/apache/parquet/filter2/recordlevel/TestValueInspector.java b/parquet-column/src/test/java/org/apache/parquet/filter2/recordlevel/TestValueInspector.java
index a164e96665..e0c55c2764 100644
--- a/parquet-column/src/test/java/org/apache/parquet/filter2/recordlevel/TestValueInspector.java
+++ b/parquet-column/src/test/java/org/apache/parquet/filter2/recordlevel/TestValueInspector.java
@@ -24,7 +24,6 @@
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
-import java.util.Arrays;
 import java.util.List;
 import org.apache.parquet.filter2.recordlevel.IncrementallyUpdatedFilterPredicate.ValueInspector;
 import org.junit.Test;
@@ -83,7 +82,7 @@ public void testLifeCycle() {
 
   @Test
   public void testReusable() {
-    List<Integer> values = Arrays.asList(2, 4, 7, 3, 8, 8, 11, 200);
+    List<Integer> values = List.of(2, 4, 7, 3, 8, 8, 11, 200);
     ValueInspector v = intIsEven();
 
     for (Integer x : values) {
diff --git a/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java
index 58a899eefc..6274f36263 100644
--- a/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java
+++ b/parquet-column/src/test/java/org/apache/parquet/internal/column/columnindex/TestColumnIndexBuilder.java
@@ -18,7 +18,6 @@
  */
 package org.apache.parquet.internal.column.columnindex;
 
-import static java.util.Arrays.asList;
 import static org.apache.parquet.filter2.predicate.FilterApi.and;
 import static org.apache.parquet.filter2.predicate.FilterApi.binaryColumn;
 import static org.apache.parquet.filter2.predicate.FilterApi.booleanColumn;
@@ -688,8 +687,8 @@ public void testStaticBuildBinary() {
     ColumnIndex columnIndex = ColumnIndexBuilder.build(
         Types.required(BINARY).as(UTF8).named("test_binary_utf8"),
         BoundaryOrder.ASCENDING,
-        asList(true, true, false, false, true, false, true, false),
-        asList(1l, 2l, 3l, 4l, 5l, 6l, 7l, 8l),
+        List.of(true, true, false, false, true, false, true, false),
+        List.of(1l, 2l, 3l, 4l, 5l, 6l, 7l, 8l),
         toBBList(
             null,
             null,
@@ -738,7 +737,7 @@ public void testFilterWithoutNullCounts() {
     ColumnIndex columnIndex = ColumnIndexBuilder.build(
         Types.required(BINARY).as(UTF8).named("test_binary_utf8"),
         BoundaryOrder.ASCENDING,
-        asList(true, true, false, false, true, false, true, false),
+        List.of(true, true, false, false, true, false, true, false),
         null,
         toBBList(
             null,
@@ -904,8 +903,8 @@ public void testStaticBuildBoolean() {
     ColumnIndex columnIndex = ColumnIndexBuilder.build(
         Types.required(BOOLEAN).named("test_boolean"),
         BoundaryOrder.DESCENDING,
-        asList(false, true, false, true, false, true),
-        asList(9l, 8l, 7l, 6l, 5l, 0l),
+        List.of(false, true, false, true, false, true),
+        List.of(9l, 8l, 7l, 6l, 5l, 0l),
         toBBList(false, null, false, null, true, null),
         toBBList(true, null, false, null, true, null));
     assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder());
@@ -1058,8 +1057,8 @@ public void testStaticBuildDouble() {
     ColumnIndex columnIndex = ColumnIndexBuilder.build(
         Types.required(DOUBLE).named("test_double"),
         BoundaryOrder.UNORDERED,
-        asList(false, false, false, false, false, false),
-        asList(0l, 1l, 2l, 3l, 4l, 5l),
+        List.of(false, false, false, false, false, false),
+        List.of(0l, 1l, 2l, 3l, 4l, 5l),
         toBBList(-1.0, -2.0, -3.0, -4.0, -5.0, -6.0),
         toBBList(1.0, 2.0, 3.0, 4.0, 5.0, 6.0));
     assertEquals(BoundaryOrder.UNORDERED, columnIndex.getBoundaryOrder());
@@ -1211,8 +1210,8 @@ public void testStaticBuildFloat() {
     ColumnIndex columnIndex = ColumnIndexBuilder.build(
         Types.required(FLOAT).named("test_float"),
         BoundaryOrder.ASCENDING,
-        asList(true, true, true, false, false, false),
-        asList(9l, 8l, 7l, 6l, 0l, 0l),
+        List.of(true, true, true, false, false, false),
+        List.of(9l, 8l, 7l, 6l, 0l, 0l),
         toBBList(null, null, null, -3.0f, -2.0f, 0.1f),
         toBBList(null, null, null, -2.0f, 0.0f, 6.0f));
     assertEquals(BoundaryOrder.ASCENDING, columnIndex.getBoundaryOrder());
@@ -1345,8 +1344,8 @@ public void testStaticBuildInt32() {
     ColumnIndex columnIndex = ColumnIndexBuilder.build(
         Types.required(INT32).named("test_int32"),
         BoundaryOrder.DESCENDING,
-        asList(false, false, false, true, true, true),
-        asList(0l, 10l, 0l, 3l, 5l, 7l),
+        List.of(false, false, false, true, true, true),
+        List.of(0l, 10l, 0l, 3l, 5l, 7l),
         toBBList(10, 8, 6, null, null, null),
         toBBList(9, 7, 5, null, null, null));
     assertEquals(BoundaryOrder.DESCENDING, columnIndex.getBoundaryOrder());
@@ -1597,8 +1596,8 @@ public void testStaticBuildInt64() {
     ColumnIndex columnIndex = ColumnIndexBuilder.build(
         Types.required(INT64).named("test_int64"),
         BoundaryOrder.UNORDERED,
-        asList(true, false, true, false, true, false),
-        asList(1l, 2l, 3l, 4l, 5l, 6l),
+        List.of(true, false, true, false, true, false),
+        List.of(1l, 2l, 3l, 4l, 5l, 6l),
         toBBList(null, 2l, null, 4l, null, 9l),
         toBBList(null, 3l, null, 15l, null, 10l));
     assertEquals(BoundaryOrder.UNORDERED, columnIndex.getBoundaryOrder());
diff --git a/parquet-column/src/test/java/org/apache/parquet/io/ExpectationValidatingConverter.java b/parquet-column/src/test/java/org/apache/parquet/io/ExpectationValidatingConverter.java
index cf8e2edc09..99073c98ed 100644
--- a/parquet-column/src/test/java/org/apache/parquet/io/ExpectationValidatingConverter.java
+++ b/parquet-column/src/test/java/org/apache/parquet/io/ExpectationValidatingConverter.java
@@ -21,7 +21,6 @@
 import static org.junit.Assert.assertEquals;
 
 import java.util.ArrayDeque;
-import java.util.Arrays;
 import java.util.Deque;
 import java.util.List;
 import org.apache.parquet.io.api.Binary;
@@ -48,7 +47,7 @@ public void validate(String got) {
   }
 
   public ExpectationValidatingConverter(String[] expectations, MessageType schema) {
-    this(new ArrayDeque<>(Arrays.asList(expectations)), schema);
+    this(new ArrayDeque<>(List.of(expectations)), schema);
   }
 
   public ExpectationValidatingConverter(Deque<String> expectations, MessageType schema) {
diff --git a/parquet-column/src/test/java/org/apache/parquet/io/TestColumnIO.java b/parquet-column/src/test/java/org/apache/parquet/io/TestColumnIO.java
index ed1e232a8d..fa4fab710c 100644
--- a/parquet-column/src/test/java/org/apache/parquet/io/TestColumnIO.java
+++ b/parquet-column/src/test/java/org/apache/parquet/io/TestColumnIO.java
@@ -144,7 +144,7 @@ public class TestColumnIO {
   @Parameterized.Parameters
   public static Collection<Object[]> data() throws IOException {
     Object[][] data = {{true}, {false}};
-    return Arrays.asList(data);
+    return List.of(data);
   }
 
   private boolean useDictionary;
@@ -386,7 +386,7 @@ public void testOneOfEach() {
         .append("g", new NanoTime(1234, System.currentTimeMillis() * 1000))
         .append("h", Binary.fromString("abc"));
 
-    testSchema(oneOfEachSchema, Arrays.asList(g1));
+    testSchema(oneOfEachSchema, List.of(g1));
   }
 
   @Test
@@ -398,7 +398,7 @@ public void testRequiredOfRequired() {
     Group g1 = gf.newGroup();
     g1.addGroup("foo").append("bar", 2l);
 
-    testSchema(reqreqSchema, Arrays.asList(g1));
+    testSchema(reqreqSchema, List.of(g1));
   }
 
   @Test
diff --git a/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveStringifier.java b/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveStringifier.java
index 3101ecea0d..b165b200d2 100644
--- a/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveStringifier.java
+++ b/parquet-column/src/test/java/org/apache/parquet/schema/TestPrimitiveStringifier.java
@@ -19,7 +19,6 @@
 package org.apache.parquet.schema;
 
 import static java.nio.charset.StandardCharsets.UTF_8;
-import static java.util.Arrays.asList;
 import static java.util.concurrent.TimeUnit.HOURS;
 import static java.util.concurrent.TimeUnit.MICROSECONDS;
 import static java.util.concurrent.TimeUnit.MILLISECONDS;
@@ -48,6 +47,7 @@
 import java.nio.ByteBuffer;
 import java.util.Calendar;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Set;
 import java.util.TimeZone;
 import java.util.concurrent.TimeUnit;
@@ -175,7 +175,7 @@ public void testDateStringifier() {
   @Test
   public void testTimestampMillisStringifier() {
     for (PrimitiveStringifier stringifier :
-        asList(TIMESTAMP_MILLIS_STRINGIFIER, TIMESTAMP_MILLIS_UTC_STRINGIFIER)) {
+        List.of(TIMESTAMP_MILLIS_STRINGIFIER, TIMESTAMP_MILLIS_UTC_STRINGIFIER)) {
       String timezoneAmendment = (stringifier == TIMESTAMP_MILLIS_STRINGIFIER ? "" : "+0000");
 
       assertEquals(withZoneString("1970-01-01T00:00:00.000", timezoneAmendment), stringifier.stringify(0l));
@@ -202,7 +202,7 @@ public void testTimestampMillisStringifier() {
   @Test
   public void testTimestampMicrosStringifier() {
     for (PrimitiveStringifier stringifier :
-        asList(TIMESTAMP_MICROS_STRINGIFIER, TIMESTAMP_MICROS_UTC_STRINGIFIER)) {
+        List.of(TIMESTAMP_MICROS_STRINGIFIER, TIMESTAMP_MICROS_UTC_STRINGIFIER)) {
       String timezoneAmendment = (stringifier == TIMESTAMP_MICROS_STRINGIFIER ? "" : "+0000");
 
       assertEquals(withZoneString("1970-01-01T00:00:00.000000", timezoneAmendment), stringifier.stringify(0l));
@@ -228,7 +228,7 @@ public void testTimestampMicrosStringifier() {
 
   @Test
   public void testTimestampNanosStringifier() {
-    for (PrimitiveStringifier stringifier : asList(TIMESTAMP_NANOS_STRINGIFIER, TIMESTAMP_NANOS_UTC_STRINGIFIER)) {
+    for (PrimitiveStringifier stringifier : List.of(TIMESTAMP_NANOS_STRINGIFIER, TIMESTAMP_NANOS_UTC_STRINGIFIER)) {
       String timezoneAmendment = (stringifier == TIMESTAMP_NANOS_STRINGIFIER ? "" : "+0000");
 
       assertEquals(withZoneString("1970-01-01T00:00:00.000000000", timezoneAmendment), stringifier.stringify(0l));
@@ -254,7 +254,7 @@ public void testTimestampNanosStringifier() {
 
   @Test
   public void testTimeStringifier() {
-    for (PrimitiveStringifier stringifier : asList(TIME_STRINGIFIER, TIME_UTC_STRINGIFIER)) {
+    for (PrimitiveStringifier stringifier : List.of(TIME_STRINGIFIER, TIME_UTC_STRINGIFIER)) {
       String timezoneAmendment = (stringifier == TIME_STRINGIFIER ? "" : "+0000");
 
       assertEquals(withZoneString("00:00:00.000", timezoneAmendment), stringifier.stringify(0));
@@ -290,7 +290,7 @@ public void testTimeStringifier() {
 
   @Test
   public void testTimeNanoStringifier() {
-    for (PrimitiveStringifier stringifier : asList(TIME_NANOS_STRINGIFIER, TIME_NANOS_UTC_STRINGIFIER)) {
+    for (PrimitiveStringifier stringifier : List.of(TIME_NANOS_STRINGIFIER, TIME_NANOS_UTC_STRINGIFIER)) {
       String timezoneAmendment = (stringifier == TIME_NANOS_STRINGIFIER ? "" : "+0000");
 
       assertEquals(withZoneString("00:00:00.000000000", timezoneAmendment), stringifier.stringify(0l));
@@ -434,7 +434,7 @@ private Binary toBinary(int... bytes) {
   }
 
   private void checkThrowingUnsupportedException(PrimitiveStringifier stringifier, Class<?>... excludes) {
-    Set<Class<?>> set = new HashSet<>(asList(excludes));
+    Set<Class<?>> set = new HashSet<>(List.of(excludes));
     if (!set.contains(Integer.TYPE)) {
       try {
         stringifier.stringify(0);
diff --git a/parquet-common/pom.xml b/parquet-common/pom.xml
index 08a3b1188b..037e65799b 100644
--- a/parquet-common/pom.xml
+++ b/parquet-common/pom.xml
@@ -21,7 +21,7 @@
     <groupId>org.apache.parquet</groupId>
     <artifactId>parquet</artifactId>
     <relativePath>../pom.xml</relativePath>
-    <version>1.17.0-SNAPSHOT</version>
+    <version>1.18.0-SNAPSHOT</version>
   </parent>
 
   <modelVersion>4.0.0</modelVersion>
@@ -64,7 +64,7 @@
 
     <dependency>
       <groupId>org.mockito</groupId>
-      <artifactId>mockito-all</artifactId>
+      <artifactId>mockito-core</artifactId>
       <version>${mockito.version}</version>
       <scope>test</scope>
     </dependency>
diff --git a/parquet-common/src/test/java/org/apache/parquet/SemanticVersionTest.java b/parquet-common/src/test/java/org/apache/parquet/SemanticVersionTest.java
index 602ea374cb..e29352d402 100644
--- a/parquet-common/src/test/java/org/apache/parquet/SemanticVersionTest.java
+++ b/parquet-common/src/test/java/org/apache/parquet/SemanticVersionTest.java
@@ -21,7 +21,6 @@
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 
-import java.util.Arrays;
 import java.util.List;
 import org.junit.Test;
 
@@ -51,7 +50,7 @@ public void testCompare() {
 
   @Test
   public void testSemverPrereleaseExamples() throws Exception {
-    List<String> examples = Arrays.asList(
+    List<String> examples = List.of(
         "1.0.0-alpha",
         "1.0.0-alpha.1",
         "1.0.0-alpha.beta",
diff --git a/parquet-common/src/test/java/org/apache/parquet/bytes/TestBytesInput.java b/parquet-common/src/test/java/org/apache/parquet/bytes/TestBytesInput.java
index 6ffe3c650a..38d4b79219 100644
--- a/parquet-common/src/test/java/org/apache/parquet/bytes/TestBytesInput.java
+++ b/parquet-common/src/test/java/org/apache/parquet/bytes/TestBytesInput.java
@@ -21,8 +21,8 @@
 import static org.junit.Assert.assertArrayEquals;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.fail;
-import static org.mockito.Matchers.anyInt;
-import static org.mockito.Matchers.anyObject;
+import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.ArgumentMatchers.anyInt;
 import static org.mockito.Mockito.never;
 import static org.mockito.Mockito.verify;
 import static org.mockito.Mockito.when;
@@ -401,6 +401,6 @@ private void validateToByteBufferIsInternal(Supplier factory) {
     Consumer callbackMock = Mockito.mock(Consumer.class);
     factory.get().toByteBuffer(allocatorMock, callbackMock);
     verify(allocatorMock, never()).allocate(anyInt());
-    verify(callbackMock, never()).accept(anyObject());
+    verify(callbackMock, never()).accept(any());
   }
 }
diff --git a/parquet-common/src/test/java/org/apache/parquet/bytes/TestConcatenatingByteBufferCollector.java b/parquet-common/src/test/java/org/apache/parquet/bytes/TestConcatenatingByteBufferCollector.java
index 8b3a9cabaf..d973a7c96a 100644
--- a/parquet-common/src/test/java/org/apache/parquet/bytes/TestConcatenatingByteBufferCollector.java
+++ b/parquet-common/src/test/java/org/apache/parquet/bytes/TestConcatenatingByteBufferCollector.java
@@ -24,7 +24,7 @@
 import java.io.InputStream;
 import java.nio.ByteBuffer;
 import java.nio.charset.StandardCharsets;
-import java.util.Arrays;
+import java.util.List;
 import org.junit.After;
 import org.junit.Assert;
 import org.junit.Before;
@@ -54,7 +54,7 @@ public void test() throws IOException {
         ConcatenatingByteBufferCollector inner = new ConcatenatingByteBufferCollector(allocator)) {
       outer.collect(BytesInput.concat(
           BytesInput.from(byteBuffer("This"), byteBuffer(" "), byteBuffer("is")),
-          BytesInput.from(Arrays.asList(byteBuffer(" a"), byteBuffer(" "), byteBuffer("test"))),
+          BytesInput.from(List.of(byteBuffer(" a"), byteBuffer(" "), byteBuffer("test"))),
           BytesInput.from(inputStream(" text to blabla"), 8),
           BytesInput.from(bytes(" ")),
           BytesInput.from(bytes("blabla validate blabla"), 7, 9),
diff --git a/parquet-common/src/test/java/org/apache/parquet/bytes/TestMultiBufferInputStream.java b/parquet-common/src/test/java/org/apache/parquet/bytes/TestMultiBufferInputStream.java
index a6ddfe794d..2c1b1ff52a 100644
--- a/parquet-common/src/test/java/org/apache/parquet/bytes/TestMultiBufferInputStream.java
+++ b/parquet-common/src/test/java/org/apache/parquet/bytes/TestMultiBufferInputStream.java
@@ -21,13 +21,12 @@
 
 import java.nio.ByteBuffer;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.List;
 import org.junit.Assert;
 import org.junit.Test;
 
 public class TestMultiBufferInputStream extends TestByteBufferInputStreams {
-  private static final List<ByteBuffer> DATA = Arrays.asList(
+  private static final List<ByteBuffer> DATA = List.of(
       ByteBuffer.wrap(new byte[] {0, 1, 2, 3, 4, 5, 6, 7, 8}),
       ByteBuffer.wrap(new byte[] {9, 10, 11, 12}),
       ByteBuffer.wrap(new byte[] {}),
diff --git a/parquet-common/src/test/java/org/apache/parquet/glob/TestGlob.java b/parquet-common/src/test/java/org/apache/parquet/glob/TestGlob.java
index 34bf7f4182..433cc360f6 100644
--- a/parquet-common/src/test/java/org/apache/parquet/glob/TestGlob.java
+++ b/parquet-common/src/test/java/org/apache/parquet/glob/TestGlob.java
@@ -21,7 +21,7 @@
 import static junit.framework.Assert.fail;
 import static org.junit.Assert.assertEquals;
 
-import java.util.Arrays;
+import java.util.List;
 import junit.framework.Assert;
 import org.apache.parquet.Strings;
 import org.apache.parquet.glob.GlobParser.GlobParseException;
@@ -31,28 +31,28 @@ public class TestGlob {
 
   @Test
   public void testNoGlobs() {
-    assertEquals(Arrays.asList("foo"), Strings.expandGlob("foo"));
+    assertEquals(List.of("foo"), Strings.expandGlob("foo"));
   }
 
   @Test
   public void testEmptyGroup() {
-    assertEquals(Arrays.asList(""), Strings.expandGlob(""));
-    assertEquals(Arrays.asList(""), Strings.expandGlob("{}"));
-    assertEquals(Arrays.asList("a"), Strings.expandGlob("a{}"));
-    assertEquals(Arrays.asList("ab"), Strings.expandGlob("a{}b"));
-    assertEquals(Arrays.asList("a"), Strings.expandGlob("{}a"));
-    assertEquals(Arrays.asList("a"), Strings.expandGlob("a{}"));
-    assertEquals(Arrays.asList("", ""), Strings.expandGlob("{,}"));
-    assertEquals(Arrays.asList("ab", "a", "ac"), Strings.expandGlob("a{b,{},c}"));
+    assertEquals(List.of(""), Strings.expandGlob(""));
+    assertEquals(List.of(""), Strings.expandGlob("{}"));
+    assertEquals(List.of("a"), Strings.expandGlob("a{}"));
+    assertEquals(List.of("ab"), Strings.expandGlob("a{}b"));
+    assertEquals(List.of("a"), Strings.expandGlob("{}a"));
+    assertEquals(List.of("a"), Strings.expandGlob("a{}"));
+    assertEquals(List.of("", ""), Strings.expandGlob("{,}"));
+    assertEquals(List.of("ab", "a", "ac"), Strings.expandGlob("a{b,{},c}"));
   }
 
   @Test
   public void testSingleLevel() {
-    assertEquals(Arrays.asList("foobar", "foobaz"), Strings.expandGlob("foo{bar,baz}"));
-    assertEquals(Arrays.asList("startfooend", "startbarend"), Strings.expandGlob("start{foo,bar}end"));
-    assertEquals(Arrays.asList("fooend", "barend"), Strings.expandGlob("{foo,bar}end"));
+    assertEquals(List.of("foobar", "foobaz"), Strings.expandGlob("foo{bar,baz}"));
+    assertEquals(List.of("startfooend", "startbarend"), Strings.expandGlob("start{foo,bar}end"));
+    assertEquals(List.of("fooend", "barend"), Strings.expandGlob("{foo,bar}end"));
     assertEquals(
-        Arrays.asList(
+        List.of(
             "startfooenda",
             "startfooendb",
             "startfooendc",
@@ -62,14 +62,14 @@ public void testSingleLevel() {
             "startbarendc",
             "startbarendd"),
         Strings.expandGlob("start{foo,bar}end{a,b,c,d}"));
-    assertEquals(Arrays.asList("xa", "xb", "xc", "ya", "yb", "yc"), Strings.expandGlob("{x,y}{a,b,c}"));
-    assertEquals(Arrays.asList("x", "y", "z"), Strings.expandGlob("{x,y,z}"));
+    assertEquals(List.of("xa", "xb", "xc", "ya", "yb", "yc"), Strings.expandGlob("{x,y}{a,b,c}"));
+    assertEquals(List.of("x", "y", "z"), Strings.expandGlob("{x,y,z}"));
   }
 
   @Test
   public void testNested() {
     assertEquals(
-        Arrays.asList(
+        List.of(
             "startoneend",
             "startpretwopostend",
             "startprethreepostend",
@@ -84,9 +84,9 @@ public void testNested() {
 
   @Test
   public void testExtraBraces() {
-    assertEquals(Arrays.asList("x", "y", "z"), Strings.expandGlob("{{x,y,z}}"));
-    assertEquals(Arrays.asList("x", "y", "z"), Strings.expandGlob("{{{x,y,z}}}"));
-    assertEquals(Arrays.asList("startx", "starta", "startb", "starty"), Strings.expandGlob("start{x,{a,b},y}"));
+    assertEquals(List.of("x", "y", "z"), Strings.expandGlob("{{x,y,z}}"));
+    assertEquals(List.of("x", "y", "z"), Strings.expandGlob("{{{x,y,z}}}"));
+    assertEquals(List.of("startx", "starta", "startb", "starty"), Strings.expandGlob("start{x,{a,b},y}"));
   }
 
   @Test
@@ -102,17 +102,17 @@ public void testCommaInTopLevel() {
   @Test
   public void testCommaCornerCases() {
     // single empty string in each location
-    assertEquals(Arrays.asList("foobar", "foo", "foobaz"), Strings.expandGlob("foo{bar,,baz}"));
-    assertEquals(Arrays.asList("foo", "foobar", "foobaz"), Strings.expandGlob("foo{,bar,baz}"));
-    assertEquals(Arrays.asList("foobar", "foobaz", "foo"), Strings.expandGlob("foo{bar,baz,}"));
+    assertEquals(List.of("foobar", "foo", "foobaz"), Strings.expandGlob("foo{bar,,baz}"));
+    assertEquals(List.of("foo", "foobar", "foobaz"), Strings.expandGlob("foo{,bar,baz}"));
+    assertEquals(List.of("foobar", "foobaz", "foo"), Strings.expandGlob("foo{bar,baz,}"));
 
     // multiple empty strings
-    assertEquals(Arrays.asList("foobar", "foo", "foo", "foobaz"), Strings.expandGlob("foo{bar,,,baz}"));
-    assertEquals(Arrays.asList("foo", "foo", "foobar", "foobaz"), Strings.expandGlob("foo{,,bar,baz}"));
-    assertEquals(Arrays.asList("foobar", "foobaz", "foo", "foo"), Strings.expandGlob("foo{bar,baz,,}"));
+    assertEquals(List.of("foobar", "foo", "foo", "foobaz"), Strings.expandGlob("foo{bar,,,baz}"));
+    assertEquals(List.of("foo", "foo", "foobar", "foobaz"), Strings.expandGlob("foo{,,bar,baz}"));
+    assertEquals(List.of("foobar", "foobaz", "foo", "foo"), Strings.expandGlob("foo{bar,baz,,}"));
 
     // between groups
-    assertEquals(Arrays.asList("x", "y", "", "a", "b"), Strings.expandGlob("{{x,y},,{a,b}}"));
+    assertEquals(List.of("x", "y", "", "a", "b"), Strings.expandGlob("{{x,y},,{a,b}}"));
   }
 
   private void assertNotEnoughCloseBraces(String s) {
diff --git a/parquet-encoding/pom.xml b/parquet-encoding/pom.xml
index 8fef044823..7605bcad74 100644
--- a/parquet-encoding/pom.xml
+++ b/parquet-encoding/pom.xml
@@ -21,7 +21,7 @@
     <groupId>org.apache.parquet</groupId>
     <artifactId>parquet</artifactId>
     <relativePath>../pom.xml</relativePath>
-    <version>1.17.0-SNAPSHOT</version>
+    <version>1.18.0-SNAPSHOT</version>
   </parent>

   <modelVersion>4.0.0</modelVersion>
diff --git a/parquet-format-structures/pom.xml b/parquet-format-structures/pom.xml
index 5818f1dfdf..7ff9c6a113 100644
--- a/parquet-format-structures/pom.xml
+++ b/parquet-format-structures/pom.xml
@@ -24,7 +24,7 @@
     <groupId>org.apache.parquet</groupId>
     <artifactId>parquet</artifactId>
     <relativePath>../pom.xml</relativePath>
-    <version>1.17.0-SNAPSHOT</version>
+    <version>1.18.0-SNAPSHOT</version>
   </parent>

   <artifactId>parquet-format-structures</artifactId>
@@ -133,6 +133,25 @@
           true
         
       
+      <plugin>
+        <groupId>org.codehaus.mojo</groupId>
+        <artifactId>build-helper-maven-plugin</artifactId>
+        <version>3.6.0</version>
+        <executions>
+          <execution>
+            <id>add-sources</id>
+            <phase>generate-sources</phase>
+            <goals>
+              <goal>add-source</goal>
+            </goals>
+            <configuration>
+              <sources>
+                <source>${project.build.directory}/generated-sources/thrift</source>
+              </sources>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
     
   
 
diff --git a/parquet-generator/pom.xml b/parquet-generator/pom.xml
index b4f52428ba..959538601c 100644
--- a/parquet-generator/pom.xml
+++ b/parquet-generator/pom.xml
@@ -21,7 +21,7 @@
     <groupId>org.apache.parquet</groupId>
     <artifactId>parquet</artifactId>
     <relativePath>../pom.xml</relativePath>
-    <version>1.17.0-SNAPSHOT</version>
+    <version>1.18.0-SNAPSHOT</version>
   </parent>

   <modelVersion>4.0.0</modelVersion>
diff --git a/parquet-hadoop-bundle/pom.xml b/parquet-hadoop-bundle/pom.xml
index 78689cd350..e43c4b61fe 100644
--- a/parquet-hadoop-bundle/pom.xml
+++ b/parquet-hadoop-bundle/pom.xml
@@ -21,7 +21,7 @@
     <groupId>org.apache.parquet</groupId>
     <artifactId>parquet</artifactId>
     <relativePath>../pom.xml</relativePath>
-    <version>1.17.0-SNAPSHOT</version>
+    <version>1.18.0-SNAPSHOT</version>
   </parent>

   <modelVersion>4.0.0</modelVersion>
diff --git a/parquet-hadoop/pom.xml b/parquet-hadoop/pom.xml
index 5d133a2be4..b31e78cb66 100644
--- a/parquet-hadoop/pom.xml
+++ b/parquet-hadoop/pom.xml
@@ -21,7 +21,7 @@
     <groupId>org.apache.parquet</groupId>
     <artifactId>parquet</artifactId>
     <relativePath>../pom.xml</relativePath>
-    <version>1.17.0-SNAPSHOT</version>
+    <version>1.18.0-SNAPSHOT</version>
   </parent>

   <modelVersion>4.0.0</modelVersion>
@@ -121,7 +121,7 @@
     <dependency>
       <groupId>${jackson.groupId}</groupId>
       <artifactId>jackson-annotations</artifactId>
-      <version>${jackson.version}</version>
+      <version>${jackson-annotations.version}</version>
     </dependency>
     <dependency>
       <groupId>${jackson.groupId}</groupId>
@@ -131,7 +131,7 @@
     <dependency>
       <groupId>org.xerial.snappy</groupId>
       <artifactId>snappy-java</artifactId>
-      <version>1.1.10.7</version>
+      <version>1.1.10.8</version>
       <type>jar</type>
       <scope>compile</scope>
     </dependency>
@@ -144,7 +144,7 @@
     <dependency>
       <groupId>io.airlift</groupId>
       <artifactId>aircompressor</artifactId>
-      <version>2.0.2</version>
+      <version>2.0.3</version>
     </dependency>
     <dependency>
       <groupId>commons-pool</groupId>
@@ -170,7 +170,7 @@
     </dependency>
     <dependency>
       <groupId>org.mockito</groupId>
-      <artifactId>mockito-all</artifactId>
+      <artifactId>mockito-core</artifactId>
       <version>${mockito.version}</version>
       <scope>test</scope>
     </dependency>
@@ -191,6 +191,12 @@
       <artifactId>slf4j-api</artifactId>
       <version>${slf4j.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.hamcrest</groupId>
+      <artifactId>hamcrest-core</artifactId>
+      <version>1.3</version>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>commons-io</groupId>
       <artifactId>commons-io</artifactId>
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
index 002028cdf5..3597898c30 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
@@ -703,11 +703,13 @@ private void addRowGroup(
     rowGroups.add(rowGroup);
   }
 
-  private List<Encoding> toFormatEncodings(Set<org.apache.parquet.column.Encoding> encodings) {
+  // Visible for testing
+  List<Encoding> toFormatEncodings(Set<org.apache.parquet.column.Encoding> encodings) {
     List<Encoding> converted = new ArrayList<Encoding>(encodings.size());
     for (org.apache.parquet.column.Encoding encoding : encodings) {
       converted.add(getEncoding(encoding));
     }
+    Collections.sort(converted);
     return converted;
   }
 
@@ -1052,12 +1054,12 @@ enum SortOrder {
     UNKNOWN
   }
 
-  private static final Set<Class> STRING_TYPES = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
+  private static final Set<Class> STRING_TYPES = Set.of(
       LogicalTypeAnnotation.StringLogicalTypeAnnotation.class,
       LogicalTypeAnnotation.EnumLogicalTypeAnnotation.class,
       LogicalTypeAnnotation.JsonLogicalTypeAnnotation.class,
       LogicalTypeAnnotation.Float16LogicalTypeAnnotation.class,
-      LogicalTypeAnnotation.UnknownLogicalTypeAnnotation.class)));
+      LogicalTypeAnnotation.UnknownLogicalTypeAnnotation.class);
 
   /**
    * Returns whether to use signed order min and max with a type. It is safe to
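
The `Collections.sort(converted)` added above makes the encoding list written to the file footer deterministic instead of depending on the iteration order of the incoming `Set`. A standalone sketch of the idea; the `Encoding` enum below is a stand-in for illustration, not the real `org.apache.parquet.format.Encoding`:

```java
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class DeterministicEncodingOrderSketch {
  enum Encoding { PLAIN, RLE, BIT_PACKED, DELTA_BINARY_PACKED }

  public static void main(String[] args) {
    // HashSet iteration order is unspecified, so an unsorted copy could vary between runs.
    Set<Encoding> encodings = new HashSet<>(List.of(Encoding.RLE, Encoding.PLAIN, Encoding.BIT_PACKED));
    List<Encoding> converted = new ArrayList<>(encodings);
    Collections.sort(converted); // natural (declaration) order, so output is stable and comparable
    System.out.println(converted); // [PLAIN, RLE, BIT_PACKED]
  }
}
```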
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordReader.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordReader.java
index c9842c9375..19b1d5426e 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordReader.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordReader.java
@@ -294,6 +294,14 @@ private static <K, V> Map<K, Set<V>> toSetMultiMap(Map<K, V> map) {
     return Collections.unmodifiableMap(setMultiMap);
   }
 
+  /**
+   * Returns the 0-based index of the row group currently being read. Returns -1 if no row group
+   * has been read yet.
+   */
+  public int getCurrentRowGroupIndex() {
+    return currentBlock;
+  }
+
   /**
    * Returns the row index of the current row. If no row has been processed or if the
    * row index information is unavailable from the underlying @{@link PageReadStore}, returns -1.
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordWriter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordWriter.java
index 41b068d01a..dd51d1ef09 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordWriter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/InternalParquetRecordWriter.java
@@ -152,6 +152,9 @@ public void close() throws IOException, InterruptedException {
   }
 
   public void write(T value) throws IOException, InterruptedException {
+    if (aborted) {
+      throw new IOException("Writer has been aborted due to a previous error and cannot accept further writes");
+    }
     try {
       writeSupport.write(value);
       ++recordCount;
@@ -171,7 +174,9 @@ public long getDataSize() {
 
   private void checkBlockSizeReached() throws IOException {
     if (recordCount >= rowGroupRecordCountThreshold) {
-      LOG.debug("record count reaches threshold: flushing {} records to disk.", recordCount);
+      if (LOG.isDebugEnabled()) {
+        LOG.debug("record count reaches threshold: flushing {} records to disk.", recordCount);
+      }
       flushRowGroupToStore();
       initStore();
       recordCountForNextMemCheck = min(
@@ -185,7 +190,9 @@ private void checkBlockSizeReached() throws IOException {
       // flush the row group if it is within ~2 records of the limit
       // it is much better to be slightly under size than to be over at all
       if (memSize > (nextRowGroupSize - 2 * recordSize)) {
-        LOG.debug("mem size {} > {}: flushing {} records to disk.", memSize, nextRowGroupSize, recordCount);
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("mem size {} > {}: flushing {} records to disk.", memSize, nextRowGroupSize, recordCount);
+        }
         flushRowGroupToStore();
         initStore();
         recordCountForNextMemCheck = min(
@@ -201,7 +208,9 @@ private void checkBlockSizeReached() throws IOException {
             recordCount
                 + props.getMaxRowCountForPageSizeCheck() // will not look more than max records ahead
             );
-        LOG.debug("Checked mem at {} will check again at: {}", recordCount, recordCountForNextMemCheck);
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("Checked mem at {} will check again at: {}", recordCount, recordCountForNextMemCheck);
+        }
       }
     }
   }
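
The new `aborted` check makes `write` fail fast once an earlier error has put the writer into an unusable state, instead of silently accepting more records. A minimal sketch of that pattern, with hypothetical names used purely for illustration (the real class sets its `aborted` flag in code not shown in this hunk):

```java
import java.io.IOException;

public class FailFastWriterSketch {
  private boolean aborted;

  public void write(String value) throws IOException {
    if (aborted) {
      throw new IOException("Writer has been aborted due to a previous error and cannot accept further writes");
    }
    try {
      System.out.println(value); // stand-in for handing the record to the real write path
    } catch (RuntimeException e) {
      aborted = true; // all subsequent write() calls now fail fast instead of producing corrupt output
      throw e;
    }
  }
}
```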
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java
index 551b1bf6c7..e0b0d76e0e 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java
@@ -1097,6 +1097,14 @@ public List<BlockMetaData> getRowGroups() {
     return blocks;
   }
 
+  /**
+   * Returns the 0-based index of the row group that was last read via {@link #readNextRowGroup()}
+   * or {@link #readNextFilteredRowGroup()}. Returns -1 if no row group has been read yet.
+   */
+  public int getCurrentRowGroupIndex() {
+    return currentBlock - 1;
+  }
+
   public void setRequestedSchema(List<ColumnDescriptor> columns) {
     paths.clear();
     for (ColumnDescriptor col : columns) {
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetReader.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetReader.java
index 4514a829c5..01ac69b330 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetReader.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetReader.java
@@ -144,6 +144,17 @@ public T read() throws IOException {
     }
   }
 
+  /**
+   * @return the 0-based index of the row group currently being read. If no row group has been
+   *     read yet, returns -1.
+   */
+  public int getCurrentRowGroupIndex() {
+    if (reader == null) {
+      return -1;
+    }
+    return reader.getCurrentRowGroupIndex();
+  }
+
   /**
    * @return the row index of the last read row. If no row has been processed, returns -1.
    */
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetRecordReader.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetRecordReader.java
index b217116aac..c0e52fc5c6 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetRecordReader.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetRecordReader.java
@@ -207,6 +207,14 @@ public boolean nextKeyValue() throws IOException, InterruptedException {
     return internalReader.nextKeyValue();
   }
 
+  /**
+   * @return the 0-based index of the row group currently being read. If no row group has been
+   *     read yet, returns -1.
+   */
+  public int getCurrentRowGroupIndex() {
+    return internalReader.getCurrentRowGroupIndex();
+  }
+
   /**
    * @return the row index of the current row. If no row has been processed, returns -1.
    */
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/PrintFooter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/PrintFooter.java
index 64153893d7..d50bccd03b 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/PrintFooter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/PrintFooter.java
@@ -23,7 +23,6 @@
 
 import java.net.URI;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.Collection;
 import java.util.Deque;
 import java.util.LinkedHashMap;
@@ -77,7 +76,7 @@ public static void main(String[] args) throws Exception {
       List<FileStatus> statuses;
       if (fileStatus.isDir()) {
         System.out.println("listing files in " + fileStatus.getPath());
-        statuses = Arrays.asList(fs.listStatus(fileStatus.getPath(), HiddenFileFilter.INSTANCE));
+        statuses = List.of(fs.listStatus(fileStatus.getPath(), HiddenFileFilter.INSTANCE));
       } else {
         statuses = new ArrayList<FileStatus>();
         statuses.add(fileStatus);
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/mapred/DeprecatedParquetInputFormat.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/mapred/DeprecatedParquetInputFormat.java
index f10c574c41..f3e822528c 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/mapred/DeprecatedParquetInputFormat.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/mapred/DeprecatedParquetInputFormat.java
@@ -19,7 +19,6 @@
 package org.apache.parquet.hadoop.mapred;
 
 import static java.lang.Boolean.TRUE;
-import static java.util.Arrays.asList;
 
 import java.io.DataInput;
 import java.io.DataOutput;
@@ -65,7 +64,7 @@ public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
   }
 
   public List