From a0955619dc32f9c5218793c008e87d1a1dc7407f Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sun, 2 Apr 2023 22:33:10 +0800 Subject: [PATCH 01/18] Update CHANGES.md for 1.13.0 release --- CHANGES.md | 123 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 7785db5486..6138485868 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -19,6 +19,129 @@ # Parquet # +### Version 1.13.0 ### + +Release Notes - Parquet - Version 1.13.0 + +#### New Feature + +* [PARQUET-1020](https://issues.apache.org/jira/browse/PARQUET-1020) - Add support for Dynamic Messages in parquet-protobuf + +#### Task + +* [PARQUET-2230](https://issues.apache.org/jira/browse/PARQUET-2230) - Add a new rewrite command powered by ParquetRewriter +* [PARQUET-2228](https://issues.apache.org/jira/browse/PARQUET-2228) - ParquetRewriter supports more than one input file +* [PARQUET-2229](https://issues.apache.org/jira/browse/PARQUET-2229) - ParquetRewriter supports masking and encrypting the same column +* [PARQUET-2227](https://issues.apache.org/jira/browse/PARQUET-2227) - Refactor different file rewriters to use single implementation + +#### Improvement + +* [PARQUET-2258](https://issues.apache.org/jira/browse/PARQUET-2258) - Storing toString fields in FilterPredicate instances can lead to memory pressure +* [PARQUET-2252](https://issues.apache.org/jira/browse/PARQUET-2252) - Make some methods public to allow external projects to implement page skipping +* [PARQUET-2159](https://issues.apache.org/jira/browse/PARQUET-2159) - Vectorized BytePacker decoder using Java VectorAPI +* [PARQUET-2246](https://issues.apache.org/jira/browse/PARQUET-2246) - Add short circuit logic to column index filter +* [PARQUET-2226](https://issues.apache.org/jira/browse/PARQUET-2226) - Support merge Bloom Filters +* [PARQUET-2224](https://issues.apache.org/jira/browse/PARQUET-2224) - Publish SBOM artifacts +* [PARQUET-2208](https://issues.apache.org/jira/browse/PARQUET-2208) - Add details to nested column encryption config doc and exception text +* [PARQUET-2195](https://issues.apache.org/jira/browse/PARQUET-2195) - Add scan command to parquet-cli +* [PARQUET-2196](https://issues.apache.org/jira/browse/PARQUET-2196) - Support LZ4_RAW codec +* [PARQUET-2176](https://issues.apache.org/jira/browse/PARQUET-2176) - Column index/statistics truncation in ParquetWriter +* [PARQUET-2197](https://issues.apache.org/jira/browse/PARQUET-2197) - Document uniform encryption +* [PARQUET-2191](https://issues.apache.org/jira/browse/PARQUET-2191) - Upgrade Scala to 2.12.17 +* [PARQUET-2169](https://issues.apache.org/jira/browse/PARQUET-2169) - Upgrade Avro to version 1.11.1 +* [PARQUET-2155](https://issues.apache.org/jira/browse/PARQUET-2155) - Upgrade protobuf version to 3.17.3 +* [PARQUET-2158](https://issues.apache.org/jira/browse/PARQUET-2158) - Upgrade Hadoop dependency to version 3.2.0 +* [PARQUET-2138](https://issues.apache.org/jira/browse/PARQUET-2138) - Add ShowBloomFilterCommand to parquet-cli +* [PARQUET-2157](https://issues.apache.org/jira/browse/PARQUET-2157) - Add BloomFilter fpp config + +#### Bug + +* [PARQUET-2202](https://issues.apache.org/jira/browse/PARQUET-2202) - Redundant String allocation on the hot path in CapacityByteArrayOutputStream.setByte +* [PARQUET-2164](https://issues.apache.org/jira/browse/PARQUET-2164) - CapacityByteArrayOutputStream overflow while writing causes negative row group sizes to be written +* [PARQUET-2103](https://issues.apache.org/jira/browse/PARQUET-2103) - Fix crypto exception in print toPrettyJSON +* [PARQUET-2251](https://issues.apache.org/jira/browse/PARQUET-2251) - Avoid generating Bloomfilter when all pages of a column are encoded by dictionary +* [PARQUET-2243](https://issues.apache.org/jira/browse/PARQUET-2243) - Support zstd-jni in DirectCodecFactory +* [PARQUET-2247](https://issues.apache.org/jira/browse/PARQUET-2247) - Fail-fast if CapacityByteArrayOutputStream write overflow +* [PARQUET-2241](https://issues.apache.org/jira/browse/PARQUET-2241) - Fix ByteStreamSplitValuesReader with nulls +* [PARQUET-2244](https://issues.apache.org/jira/browse/PARQUET-2244) - Fix notIn for columns with null values +* [PARQUET-2173](https://issues.apache.org/jira/browse/PARQUET-2173) - Fix parquet build against hadoop 3.3.3+ +* [PARQUET-2219](https://issues.apache.org/jira/browse/PARQUET-2219) - ParquetFileReader skips empty row group +* [PARQUET-2198](https://issues.apache.org/jira/browse/PARQUET-2198) - Updating jackson data bind version to fix CVEs +* [PARQUET-2177](https://issues.apache.org/jira/browse/PARQUET-2177) - Fix parquet-cli not to fail showing descriptions +* [PARQUET-1711](https://issues.apache.org/jira/browse/PARQUET-1711) - Support recursive proto schemas by limiting recursion depth +* [PARQUET-2142](https://issues.apache.org/jira/browse/PARQUET-2142) - parquet-cli without hadoop throws java.lang.NoSuchMethodError on any parquet file access command +* [PARQUET-2160](https://issues.apache.org/jira/browse/PARQUET-2160) - Close decompression stream to free off-heap memory in time +* [PARQUET-2185](https://issues.apache.org/jira/browse/PARQUET-2185) - ParquetReader constructed using builder fails to read encrypted files +* [PARQUET-2167](https://issues.apache.org/jira/browse/PARQUET-2167) - CLI show footer command fails if Parquet file contains date fields +* [PARQUET-2134](https://issues.apache.org/jira/browse/PARQUET-2134) - Incorrect type checking in HadoopStreams.wrap +* [PARQUET-2161](https://issues.apache.org/jira/browse/PARQUET-2161) - Fix row index generation in combination with range filtering +* [PARQUET-2154](https://issues.apache.org/jira/browse/PARQUET-2154) - ParquetFileReader should close its input stream when filterRowGroups throw Exception in constructor + +#### Test + +* [PARQUET-2192](https://issues.apache.org/jira/browse/PARQUET-2192) - Add Java 17 build test to GitHub action + +### Version 1.12.3 ### + +Release Notes - Parquet - Version 1.12.3 + +#### New Feature + +* [PARQUET-2117](https://issues.apache.org/jira/browse/PARQUET-2117) - Add rowPosition API in parquet record readers + +#### Task + +* [PARQUET-2081](https://issues.apache.org/jira/browse/PARQUET-2081) - Encryption translation tool - Parquet-hadoop + +#### Improvement + +* [PARQUET-2040](https://issues.apache.org/jira/browse/PARQUET-2040) - Uniform encryption +* [PARQUET-2076](https://issues.apache.org/jira/browse/PARQUET-2076) - Improve Travis CI build Performance +* [PARQUET-2105](https://issues.apache.org/jira/browse/PARQUET-2105) - Refactor the test code of creating the test file +* [PARQUET-2106](https://issues.apache.org/jira/browse/PARQUET-2106) - BinaryComparator should avoid doing ByteBuffer.wrap in the hot-path +* [PARQUET-2112](https://issues.apache.org/jira/browse/PARQUET-2112) - Fix typo in MessageColumnIO +* [PARQUET-2121](https://issues.apache.org/jira/browse/PARQUET-2121) - Remove descriptions for the removed modules +* [PARQUET-2127](https://issues.apache.org/jira/browse/PARQUET-2127) - Security risk in latest parquet-jackson-1.12.2.jar +* [PARQUET-2128](https://issues.apache.org/jira/browse/PARQUET-2128) - Bump Thrift to 0.16.0 +* [PARQUET-2129](https://issues.apache.org/jira/browse/PARQUET-2129) - Add uncompressedSize to "meta" output +* [PARQUET-2136](https://issues.apache.org/jira/browse/PARQUET-2136) - File writer construction with encryptor + +#### Bug + +* [PARQUET-2101](https://issues.apache.org/jira/browse/PARQUET-2101) - Fix wrong descriptions about the default block size +* [PARQUET-2102](https://issues.apache.org/jira/browse/PARQUET-2102) - Typo in ColumnIndexBase toString +* [PARQUET-2107](https://issues.apache.org/jira/browse/PARQUET-2107) - Travis failures +* [PARQUET-2120](https://issues.apache.org/jira/browse/PARQUET-2120) - parquet-cli dictionary command fails on pages without dictionary encoding +* [PARQUET-2144](https://issues.apache.org/jira/browse/PARQUET-2144) - Fix ColumnIndexBuilder for notIn predicate +* [PARQUET-2148](https://issues.apache.org/jira/browse/PARQUET-2148) - Enable uniform decryption with plaintext footer + +### Version 1.12.2 ### + +Release Notes - Parquet - Version 1.12.2 + +#### Bug + +* [PARQUET-2094](https://issues.apache.org/jira/browse/PARQUET-2094) - Handle negative values in page headers + +### Version 1.12.1 ### + +Release Notes - Parquet - Version 1.12.1 + +#### Bug + +* [PARQUET-1633](https://issues.apache.org/jira/browse/PARQUET-1633) - Fix integer overflow +* [PARQUET-2022](https://issues.apache.org/jira/browse/PARQUET-2022) - ZstdDecompressorStream should close zstdInputStream +* [PARQUET-2027](https://issues.apache.org/jira/browse/PARQUET-2027) - Fix calculating directory offset for merge +* [PARQUET-2052](https://issues.apache.org/jira/browse/PARQUET-2052) - Integer overflow when writing huge binary using dictionary encoding +* [PARQUET-2054](https://issues.apache.org/jira/browse/PARQUET-2054) - fix TCP leaking when calling ParquetFileWriter.appendFile +* [PARQUET-2072](https://issues.apache.org/jira/browse/PARQUET-2072) - Do Not Determine Both Min/Max for Binary Stats +* [PARQUET-2073](https://issues.apache.org/jira/browse/PARQUET-2073) - Fix estimate remaining row count in ColumnWriteStoreBase. +* [PARQUET-2078](https://issues.apache.org/jira/browse/PARQUET-2078) - Failed to read parquet file after writing with the same parquet version + +#### Improvement + +* [PARQUET-2064](https://issues.apache.org/jira/browse/PARQUET-2064) - Make Range public accessible in RowRanges + ### Version 1.12.0 ### Release Notes - Parquet - Version 1.12.0 From 2e369ed173f66f057c296e63c1bc31d77f294f41 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sun, 2 Apr 2023 23:54:36 +0800 Subject: [PATCH 02/18] [maven-release-plugin] prepare release apache-parquet-1.13.0-rc0 --- parquet-arrow/pom.xml | 2 +- parquet-avro/pom.xml | 2 +- parquet-benchmarks/pom.xml | 2 +- parquet-cli/pom.xml | 2 +- parquet-column/pom.xml | 2 +- parquet-common/pom.xml | 2 +- parquet-encoding/pom.xml | 2 +- parquet-format-structures/pom.xml | 2 +- parquet-generator/pom.xml | 2 +- parquet-hadoop-bundle/pom.xml | 2 +- parquet-hadoop/pom.xml | 2 +- parquet-jackson/pom.xml | 2 +- parquet-pig-bundle/pom.xml | 2 +- parquet-pig/pom.xml | 2 +- parquet-protobuf/pom.xml | 2 +- parquet-scala/pom.xml | 2 +- parquet-thrift/pom.xml | 2 +- pom.xml | 4 ++-- 18 files changed, 19 insertions(+), 19 deletions(-) diff --git a/parquet-arrow/pom.xml b/parquet-arrow/pom.xml index 81d6e7ab49..d996d2335e 100644 --- a/parquet-arrow/pom.xml +++ b/parquet-arrow/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.0 4.0.0 diff --git a/parquet-avro/pom.xml b/parquet-avro/pom.xml index c4d12e484f..72827f0670 100644 --- a/parquet-avro/pom.xml +++ b/parquet-avro/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.0 4.0.0 diff --git a/parquet-benchmarks/pom.xml b/parquet-benchmarks/pom.xml index 673ff86c6a..104bcc9523 100644 --- a/parquet-benchmarks/pom.xml +++ b/parquet-benchmarks/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.0 4.0.0 diff --git a/parquet-cli/pom.xml b/parquet-cli/pom.xml index f819793482..0f958f611e 100644 --- a/parquet-cli/pom.xml +++ b/parquet-cli/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.0 4.0.0 diff --git a/parquet-column/pom.xml b/parquet-column/pom.xml index 664a6be141..541d52d6cc 100644 --- a/parquet-column/pom.xml +++ b/parquet-column/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.0 4.0.0 diff --git a/parquet-common/pom.xml b/parquet-common/pom.xml index 1a0f2f9f5e..e7dcc99af4 100644 --- a/parquet-common/pom.xml +++ b/parquet-common/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.0 4.0.0 diff --git a/parquet-encoding/pom.xml b/parquet-encoding/pom.xml index 2b27c19eaa..9d20941739 100644 --- a/parquet-encoding/pom.xml +++ b/parquet-encoding/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.0 4.0.0 diff --git a/parquet-format-structures/pom.xml b/parquet-format-structures/pom.xml index ce72ed5353..fae8a8a461 100644 --- a/parquet-format-structures/pom.xml +++ b/parquet-format-structures/pom.xml @@ -24,7 +24,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.0 parquet-format-structures diff --git a/parquet-generator/pom.xml b/parquet-generator/pom.xml index a39370da19..faf8362c41 100644 --- a/parquet-generator/pom.xml +++ b/parquet-generator/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.0 4.0.0 diff --git a/parquet-hadoop-bundle/pom.xml b/parquet-hadoop-bundle/pom.xml index d15792f241..c644df6012 100644 --- a/parquet-hadoop-bundle/pom.xml +++ b/parquet-hadoop-bundle/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.0 4.0.0 diff --git a/parquet-hadoop/pom.xml b/parquet-hadoop/pom.xml index ce476a15f2..af0d26d6c1 100644 --- a/parquet-hadoop/pom.xml +++ b/parquet-hadoop/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.0 4.0.0 diff --git a/parquet-jackson/pom.xml b/parquet-jackson/pom.xml index 8121832813..c9669159a8 100644 --- a/parquet-jackson/pom.xml +++ b/parquet-jackson/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.0 4.0.0 diff --git a/parquet-pig-bundle/pom.xml b/parquet-pig-bundle/pom.xml index 63661d497e..cd3457ed3a 100644 --- a/parquet-pig-bundle/pom.xml +++ b/parquet-pig-bundle/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.0 4.0.0 diff --git a/parquet-pig/pom.xml b/parquet-pig/pom.xml index 87f37333d3..eadf4e974e 100644 --- a/parquet-pig/pom.xml +++ b/parquet-pig/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.0 4.0.0 diff --git a/parquet-protobuf/pom.xml b/parquet-protobuf/pom.xml index 2b8dbc3ba9..358da9c151 100644 --- a/parquet-protobuf/pom.xml +++ b/parquet-protobuf/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.0 4.0.0 diff --git a/parquet-scala/pom.xml b/parquet-scala/pom.xml index 4420cfe2c0..db5c6e810f 100644 --- a/parquet-scala/pom.xml +++ b/parquet-scala/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.0 4.0.0 diff --git a/parquet-thrift/pom.xml b/parquet-thrift/pom.xml index e30b5d43a3..bc606a5744 100644 --- a/parquet-thrift/pom.xml +++ b/parquet-thrift/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.0 4.0.0 diff --git a/pom.xml b/pom.xml index 9dd515122e..ec877505c0 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,7 @@ org.apache.parquet parquet - 1.13.0-SNAPSHOT + 1.13.0 pom Apache Parquet MR @@ -20,7 +20,7 @@ scm:git:git@github.com:apache/parquet-mr.git scm:git:git@github.com:apache/parquet-mr.git scm:git:git@github.com:apache/parquet-mr.git - HEAD + apache-parquet-1.13.0-rc0 From a52ce55f4538431bc23c515b2b733729191fda03 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sun, 2 Apr 2023 23:54:47 +0800 Subject: [PATCH 03/18] [maven-release-plugin] prepare for next development iteration --- parquet-arrow/pom.xml | 2 +- parquet-avro/pom.xml | 2 +- parquet-benchmarks/pom.xml | 2 +- parquet-cli/pom.xml | 2 +- parquet-column/pom.xml | 2 +- parquet-common/pom.xml | 2 +- parquet-encoding/pom.xml | 2 +- parquet-format-structures/pom.xml | 2 +- parquet-generator/pom.xml | 2 +- parquet-hadoop-bundle/pom.xml | 2 +- parquet-hadoop/pom.xml | 2 +- parquet-jackson/pom.xml | 2 +- parquet-pig-bundle/pom.xml | 2 +- parquet-pig/pom.xml | 2 +- parquet-protobuf/pom.xml | 2 +- parquet-scala/pom.xml | 2 +- parquet-thrift/pom.xml | 2 +- pom.xml | 4 ++-- 18 files changed, 19 insertions(+), 19 deletions(-) diff --git a/parquet-arrow/pom.xml b/parquet-arrow/pom.xml index d996d2335e..81d6e7ab49 100644 --- a/parquet-arrow/pom.xml +++ b/parquet-arrow/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0 + 1.13.0-SNAPSHOT 4.0.0 diff --git a/parquet-avro/pom.xml b/parquet-avro/pom.xml index 72827f0670..c4d12e484f 100644 --- a/parquet-avro/pom.xml +++ b/parquet-avro/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0 + 1.13.0-SNAPSHOT 4.0.0 diff --git a/parquet-benchmarks/pom.xml b/parquet-benchmarks/pom.xml index 104bcc9523..673ff86c6a 100644 --- a/parquet-benchmarks/pom.xml +++ b/parquet-benchmarks/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0 + 1.13.0-SNAPSHOT 4.0.0 diff --git a/parquet-cli/pom.xml b/parquet-cli/pom.xml index 0f958f611e..f819793482 100644 --- a/parquet-cli/pom.xml +++ b/parquet-cli/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0 + 1.13.0-SNAPSHOT 4.0.0 diff --git a/parquet-column/pom.xml b/parquet-column/pom.xml index 541d52d6cc..664a6be141 100644 --- a/parquet-column/pom.xml +++ b/parquet-column/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0 + 1.13.0-SNAPSHOT 4.0.0 diff --git a/parquet-common/pom.xml b/parquet-common/pom.xml index e7dcc99af4..1a0f2f9f5e 100644 --- a/parquet-common/pom.xml +++ b/parquet-common/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0 + 1.13.0-SNAPSHOT 4.0.0 diff --git a/parquet-encoding/pom.xml b/parquet-encoding/pom.xml index 9d20941739..2b27c19eaa 100644 --- a/parquet-encoding/pom.xml +++ b/parquet-encoding/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0 + 1.13.0-SNAPSHOT 4.0.0 diff --git a/parquet-format-structures/pom.xml b/parquet-format-structures/pom.xml index fae8a8a461..ce72ed5353 100644 --- a/parquet-format-structures/pom.xml +++ b/parquet-format-structures/pom.xml @@ -24,7 +24,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0 + 1.13.0-SNAPSHOT parquet-format-structures diff --git a/parquet-generator/pom.xml b/parquet-generator/pom.xml index faf8362c41..a39370da19 100644 --- a/parquet-generator/pom.xml +++ b/parquet-generator/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0 + 1.13.0-SNAPSHOT 4.0.0 diff --git a/parquet-hadoop-bundle/pom.xml b/parquet-hadoop-bundle/pom.xml index c644df6012..d15792f241 100644 --- a/parquet-hadoop-bundle/pom.xml +++ b/parquet-hadoop-bundle/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0 + 1.13.0-SNAPSHOT 4.0.0 diff --git a/parquet-hadoop/pom.xml b/parquet-hadoop/pom.xml index af0d26d6c1..ce476a15f2 100644 --- a/parquet-hadoop/pom.xml +++ b/parquet-hadoop/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0 + 1.13.0-SNAPSHOT 4.0.0 diff --git a/parquet-jackson/pom.xml b/parquet-jackson/pom.xml index c9669159a8..8121832813 100644 --- a/parquet-jackson/pom.xml +++ b/parquet-jackson/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0 + 1.13.0-SNAPSHOT 4.0.0 diff --git a/parquet-pig-bundle/pom.xml b/parquet-pig-bundle/pom.xml index cd3457ed3a..63661d497e 100644 --- a/parquet-pig-bundle/pom.xml +++ b/parquet-pig-bundle/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0 + 1.13.0-SNAPSHOT 4.0.0 diff --git a/parquet-pig/pom.xml b/parquet-pig/pom.xml index eadf4e974e..87f37333d3 100644 --- a/parquet-pig/pom.xml +++ b/parquet-pig/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0 + 1.13.0-SNAPSHOT 4.0.0 diff --git a/parquet-protobuf/pom.xml b/parquet-protobuf/pom.xml index 358da9c151..2b8dbc3ba9 100644 --- a/parquet-protobuf/pom.xml +++ b/parquet-protobuf/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0 + 1.13.0-SNAPSHOT 4.0.0 diff --git a/parquet-scala/pom.xml b/parquet-scala/pom.xml index db5c6e810f..4420cfe2c0 100644 --- a/parquet-scala/pom.xml +++ b/parquet-scala/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0 + 1.13.0-SNAPSHOT 4.0.0 diff --git a/parquet-thrift/pom.xml b/parquet-thrift/pom.xml index bc606a5744..e30b5d43a3 100644 --- a/parquet-thrift/pom.xml +++ b/parquet-thrift/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0 + 1.13.0-SNAPSHOT 4.0.0 diff --git a/pom.xml b/pom.xml index ec877505c0..9dd515122e 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,7 @@ org.apache.parquet parquet - 1.13.0 + 1.13.0-SNAPSHOT pom Apache Parquet MR @@ -20,7 +20,7 @@ scm:git:git@github.com:apache/parquet-mr.git scm:git:git@github.com:apache/parquet-mr.git scm:git:git@github.com:apache/parquet-mr.git - apache-parquet-1.13.0-rc0 + HEAD From 54b4501a490af19c91f362e95d52881b6658ad50 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Thu, 6 Apr 2023 09:21:11 +0800 Subject: [PATCH 04/18] Prepare for next development iteration --- parquet-arrow/pom.xml | 2 +- parquet-avro/pom.xml | 2 +- parquet-benchmarks/pom.xml | 2 +- parquet-cli/pom.xml | 2 +- parquet-column/pom.xml | 2 +- parquet-common/pom.xml | 2 +- parquet-encoding/pom.xml | 2 +- parquet-format-structures/pom.xml | 2 +- parquet-generator/pom.xml | 2 +- parquet-hadoop-bundle/pom.xml | 2 +- parquet-hadoop/pom.xml | 2 +- parquet-jackson/pom.xml | 2 +- parquet-pig-bundle/pom.xml | 2 +- parquet-pig/pom.xml | 2 +- parquet-protobuf/pom.xml | 2 +- parquet-scala/pom.xml | 2 +- parquet-thrift/pom.xml | 2 +- pom.xml | 4 ++-- 18 files changed, 19 insertions(+), 19 deletions(-) diff --git a/parquet-arrow/pom.xml b/parquet-arrow/pom.xml index 81d6e7ab49..4f6835d10b 100644 --- a/parquet-arrow/pom.xml +++ b/parquet-arrow/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.1-SNAPSHOT 4.0.0 diff --git a/parquet-avro/pom.xml b/parquet-avro/pom.xml index c4d12e484f..52a6f07069 100644 --- a/parquet-avro/pom.xml +++ b/parquet-avro/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.1-SNAPSHOT 4.0.0 diff --git a/parquet-benchmarks/pom.xml b/parquet-benchmarks/pom.xml index 673ff86c6a..5a285a38e7 100644 --- a/parquet-benchmarks/pom.xml +++ b/parquet-benchmarks/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.1-SNAPSHOT 4.0.0 diff --git a/parquet-cli/pom.xml b/parquet-cli/pom.xml index f819793482..fcc4993f10 100644 --- a/parquet-cli/pom.xml +++ b/parquet-cli/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.1-SNAPSHOT 4.0.0 diff --git a/parquet-column/pom.xml b/parquet-column/pom.xml index 664a6be141..cf71411675 100644 --- a/parquet-column/pom.xml +++ b/parquet-column/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.1-SNAPSHOT 4.0.0 diff --git a/parquet-common/pom.xml b/parquet-common/pom.xml index 1a0f2f9f5e..a655f27a9b 100644 --- a/parquet-common/pom.xml +++ b/parquet-common/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.1-SNAPSHOT 4.0.0 diff --git a/parquet-encoding/pom.xml b/parquet-encoding/pom.xml index 2b27c19eaa..c714c63608 100644 --- a/parquet-encoding/pom.xml +++ b/parquet-encoding/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.1-SNAPSHOT 4.0.0 diff --git a/parquet-format-structures/pom.xml b/parquet-format-structures/pom.xml index ce72ed5353..c589b80d84 100644 --- a/parquet-format-structures/pom.xml +++ b/parquet-format-structures/pom.xml @@ -24,7 +24,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.1-SNAPSHOT parquet-format-structures diff --git a/parquet-generator/pom.xml b/parquet-generator/pom.xml index a39370da19..14bb4f4d6c 100644 --- a/parquet-generator/pom.xml +++ b/parquet-generator/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.1-SNAPSHOT 4.0.0 diff --git a/parquet-hadoop-bundle/pom.xml b/parquet-hadoop-bundle/pom.xml index d15792f241..94264d995f 100644 --- a/parquet-hadoop-bundle/pom.xml +++ b/parquet-hadoop-bundle/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.1-SNAPSHOT 4.0.0 diff --git a/parquet-hadoop/pom.xml b/parquet-hadoop/pom.xml index ce476a15f2..06688ed81c 100644 --- a/parquet-hadoop/pom.xml +++ b/parquet-hadoop/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.1-SNAPSHOT 4.0.0 diff --git a/parquet-jackson/pom.xml b/parquet-jackson/pom.xml index 8121832813..4445439dd1 100644 --- a/parquet-jackson/pom.xml +++ b/parquet-jackson/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.1-SNAPSHOT 4.0.0 diff --git a/parquet-pig-bundle/pom.xml b/parquet-pig-bundle/pom.xml index 63661d497e..2b9cbd8f8c 100644 --- a/parquet-pig-bundle/pom.xml +++ b/parquet-pig-bundle/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.1-SNAPSHOT 4.0.0 diff --git a/parquet-pig/pom.xml b/parquet-pig/pom.xml index 87f37333d3..b7058ffbc9 100644 --- a/parquet-pig/pom.xml +++ b/parquet-pig/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.1-SNAPSHOT 4.0.0 diff --git a/parquet-protobuf/pom.xml b/parquet-protobuf/pom.xml index 2b8dbc3ba9..ee3c813280 100644 --- a/parquet-protobuf/pom.xml +++ b/parquet-protobuf/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.1-SNAPSHOT 4.0.0 diff --git a/parquet-scala/pom.xml b/parquet-scala/pom.xml index 4420cfe2c0..44900a5e78 100644 --- a/parquet-scala/pom.xml +++ b/parquet-scala/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.1-SNAPSHOT 4.0.0 diff --git a/parquet-thrift/pom.xml b/parquet-thrift/pom.xml index e30b5d43a3..3e7925992c 100644 --- a/parquet-thrift/pom.xml +++ b/parquet-thrift/pom.xml @@ -21,7 +21,7 @@ org.apache.parquet parquet ../pom.xml - 1.13.0-SNAPSHOT + 1.13.1-SNAPSHOT 4.0.0 diff --git a/pom.xml b/pom.xml index 9dd515122e..b49b0ce2b8 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,7 @@ org.apache.parquet parquet - 1.13.0-SNAPSHOT + 1.13.1-SNAPSHOT pom Apache Parquet MR @@ -79,7 +79,7 @@ shaded.parquet 3.2.3 2.9.0 - 1.12.0 + 1.13.0 thrift ${thrift.executable} 2.12.17 From 30a42c3624ceae11aac293d2c59656200f5ffb74 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sun, 16 Apr 2023 10:46:40 +0800 Subject: [PATCH 05/18] MINOR: update version of disabled module (#1066) --- parquet-plugins/parquet-encoding-vector/pom.xml | 2 +- parquet-plugins/parquet-plugins-benchmarks/pom.xml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/parquet-plugins/parquet-encoding-vector/pom.xml b/parquet-plugins/parquet-encoding-vector/pom.xml index 298c7e1ae4..7b3561e146 100644 --- a/parquet-plugins/parquet-encoding-vector/pom.xml +++ b/parquet-plugins/parquet-encoding-vector/pom.xml @@ -22,7 +22,7 @@ org.apache.parquet parquet - 1.13.0-SNAPSHOT + 1.13.1-SNAPSHOT ../../pom.xml diff --git a/parquet-plugins/parquet-plugins-benchmarks/pom.xml b/parquet-plugins/parquet-plugins-benchmarks/pom.xml index fc898815a2..f140c65ace 100644 --- a/parquet-plugins/parquet-plugins-benchmarks/pom.xml +++ b/parquet-plugins/parquet-plugins-benchmarks/pom.xml @@ -22,7 +22,7 @@ org.apache.parquet parquet - 1.13.0-SNAPSHOT + 1.13.1-SNAPSHOT ../../pom.xml From fad89ee39f6a29fb001a433a2f2006a00e39ce8e Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sun, 16 Apr 2023 19:41:53 +0800 Subject: [PATCH 06/18] PARQUET-2081: Fix support for rewriting files without ColumnIndexes (#1048) (#1058) Fix for failure when rewriting ColumnChunks that do not have a ColumnIndex populated Co-authored-by: Richard Kerr --- .../parquet/hadoop/ParquetFileWriter.java | 2 +- .../hadoop/rewrite/ParquetRewriterTest.java | 85 +++++++++++++++++-- 2 files changed, 81 insertions(+), 6 deletions(-) diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java index 3e5c718ba0..9cd7f13819 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java @@ -1127,7 +1127,7 @@ public void appendColumnChunk(ColumnDescriptor descriptor, SeekableInputStream f long length = chunk.getTotalSize(); long newChunkStart = out.getPos(); - if (newChunkStart != start) { + if (offsetIndex != null && newChunkStart != start) { offsetIndex = OffsetIndexBuilder.getBuilder() .fromOffsetIndex(offsetIndex) .build(newChunkStart - start); diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/rewrite/ParquetRewriterTest.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/rewrite/ParquetRewriterTest.java index 043261f77f..bc8d451994 100644 --- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/rewrite/ParquetRewriterTest.java +++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/rewrite/ParquetRewriterTest.java @@ -19,10 +19,12 @@ package org.apache.parquet.hadoop.rewrite; import com.google.common.collect.Lists; +import com.google.common.collect.Maps; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.parquet.HadoopReadOptions; import org.apache.parquet.ParquetReadOptions; +import org.apache.parquet.Version; import org.apache.parquet.column.ParquetProperties; import org.apache.parquet.crypto.FileDecryptionProperties; import org.apache.parquet.crypto.FileEncryptionProperties; @@ -49,12 +51,14 @@ import org.apache.parquet.internal.column.columnindex.ColumnIndex; import org.apache.parquet.internal.column.columnindex.OffsetIndex; import org.apache.parquet.io.InputFile; +import org.apache.parquet.io.InvalidRecordException; import org.apache.parquet.io.SeekableInputStream; import org.apache.parquet.schema.GroupType; import org.apache.parquet.schema.InvalidSchemaException; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.PrimitiveType; import org.apache.parquet.schema.Type; +import org.junit.Before; import org.junit.Test; import java.io.IOException; @@ -66,6 +70,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.stream.Collectors; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; @@ -132,6 +137,11 @@ private void testPruneSingleColumnTranslateCodec(List inputPaths) throws E validateCreatedBy(); } + @Before + public void setUp() { + outputFile = TestFileBuilder.createTempFile("test"); + } + @Test public void testPruneSingleColumnTranslateCodecSingleFile() throws Exception { testSingleInputFileSetup("GZIP"); @@ -296,6 +306,70 @@ public void testPruneEncryptTranslateCodecTwoFiles() throws Exception { testPruneEncryptTranslateCodec(inputPaths); } + @Test + public void testRewriteWithoutColumnIndexes() throws Exception { + List inputPaths = new ArrayList() {{ + add(new Path(ParquetRewriterTest.class.getResource("/test-file-with-no-column-indexes-1.parquet").toURI())); + }}; + + inputFiles = inputPaths.stream().map(p -> new EncryptionTestFile(p.toString(), null)).collect(Collectors.toList()); + + Path outputPath = new Path(outputFile); + RewriteOptions.Builder builder = new RewriteOptions.Builder(conf, inputPaths, outputPath); + + Map maskCols = Maps.newHashMap(); + maskCols.put("location.lat", MaskMode.NULLIFY); + maskCols.put("location.lon", MaskMode.NULLIFY); + maskCols.put("location", MaskMode.NULLIFY); + + List pruneCols = Lists.newArrayList("phoneNumbers"); + + RewriteOptions options = builder.mask(maskCols).prune(pruneCols).build(); + rewriter = new ParquetRewriter(options); + rewriter.processBlocks(); + rewriter.close(); + + // Verify the schema are not changed for the columns not pruned + ParquetMetadata pmd = ParquetFileReader.readFooter(conf, new Path(outputFile), ParquetMetadataConverter.NO_FILTER); + MessageType schema = pmd.getFileMetaData().getSchema(); + List fields = schema.getFields(); + assertEquals(fields.size(), 3); + assertEquals(fields.get(0).getName(), "id"); + assertEquals(fields.get(1).getName(), "name"); + assertEquals(fields.get(2).getName(), "location"); + List subFields = fields.get(2).asGroupType().getFields(); + assertEquals(subFields.size(), 2); + assertEquals(subFields.get(0).getName(), "lon"); + assertEquals(subFields.get(1).getName(), "lat"); + + try(ParquetReader outReader = ParquetReader.builder(new GroupReadSupport(), new Path(outputFile)).withConf(conf).build(); + ParquetReader inReader = ParquetReader.builder(new GroupReadSupport(), inputPaths.get(0)).withConf(conf).build(); + ) { + + for(Group inRead = inReader.read(), outRead = outReader.read(); + inRead != null || outRead != null; + inRead = inReader.read(), outRead = outReader.read()) { + assertNotNull(inRead); + assertNotNull(outRead); + + assertEquals(inRead.getLong("id", 0), outRead.getLong("id", 0)); + assertEquals(inRead.getString("name", 0), outRead.getString("name", 0)); + + // location was nulled + Group finalOutRead = outRead; + assertThrows(RuntimeException.class, () -> finalOutRead.getGroup("location", 0).getDouble("lat", 0)); + assertThrows(RuntimeException.class, () -> finalOutRead.getGroup("location", 0).getDouble("lon", 0)); + + // phonenumbers was pruned + assertThrows(InvalidRecordException.class, () -> finalOutRead.getGroup("phoneNumbers", 0)); + + } + } + + // Verify original.created.by is preserved + validateCreatedBy(); + } + private void testNullifyAndEncryptColumn(List inputPaths) throws Exception { Map maskColumns = new HashMap<>(); maskColumns.put("DocId", MaskMode.NULLIFY); @@ -436,7 +510,6 @@ public void testMergeTwoFilesWithDifferentSchema() throws Exception { .withCodec("UNCOMPRESSED") .withPageSize(ParquetProperties.DEFAULT_PAGE_SIZE) .build()); - outputFile = TestFileBuilder.createTempFile("test"); List inputPaths = new ArrayList<>(); for (EncryptionTestFile inputFile : inputFiles) { @@ -458,7 +531,6 @@ private void testSingleInputFileSetup(String compression) throws IOException { .withCodec(compression) .withPageSize(ParquetProperties.DEFAULT_PAGE_SIZE) .build()); - outputFile = TestFileBuilder.createTempFile("test"); } private void testMultipleInputFilesSetup() throws IOException { @@ -474,7 +546,7 @@ private void testMultipleInputFilesSetup() throws IOException { .withCodec("UNCOMPRESSED") .withPageSize(ParquetProperties.DEFAULT_PAGE_SIZE) .build()); - outputFile = TestFileBuilder.createTempFile("test"); + } private MessageType createSchema() { @@ -686,10 +758,13 @@ private void validateCreatedBy() throws Exception { // Verify created_by has been set FileMetaData outFMD = getFileMetaData(outputFile, null).getFileMetaData(); - String inputCreatedBy = (String) inputCreatedBys[0]; - assertEquals(inputCreatedBy, outFMD.getCreatedBy()); + final String createdBy = outFMD.getCreatedBy(); + assertNotNull(createdBy); + assertEquals(createdBy, Version.FULL_VERSION); + // Verify original.created.by has been set + String inputCreatedBy = (String) inputCreatedBys[0]; String originalCreatedBy = outFMD.getKeyValueMetaData().get(ParquetRewriter.ORIGINAL_CREATED_BY_KEY); assertEquals(inputCreatedBy, originalCreatedBy); } From 4f7ced5f34b1705ce9e20a3ee9bb6635f2dbd5c7 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Wed, 19 Apr 2023 11:10:30 +0200 Subject: [PATCH 07/18] PARQUET-2283: Remove Hadoop HiddenFileFilter (#1072) (#1073) For Iceberg/Flink we would like to run without the hadoop dependencies. The use of the HiddenFileFilter is blocking this. This replaces the filter with a nice stream. --- .../parquet/hadoop/ParquetFileReader.java | 25 +++++++++++++------ 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java index 7fa71cb618..b50149cdb5 100644 --- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java +++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileReader.java @@ -53,6 +53,8 @@ import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; +import java.util.stream.Collectors; +import java.util.stream.Stream; import java.util.zip.CRC32; import org.apache.hadoop.conf.Configuration; @@ -99,7 +101,6 @@ import org.apache.parquet.hadoop.metadata.FileMetaData; import org.apache.parquet.hadoop.metadata.ParquetMetadata; import org.apache.parquet.hadoop.util.HadoopInputFile; -import org.apache.parquet.hadoop.util.HiddenFileFilter; import org.apache.parquet.hadoop.util.counters.BenchmarkCounter; import org.apache.parquet.internal.column.columnindex.ColumnIndex; import org.apache.parquet.internal.column.columnindex.OffsetIndex; @@ -374,17 +375,25 @@ public static List