@@ -1973,6 +1973,60 @@ cdef vector[CSortingColumn] _convert_sorting_columns(sorting_columns) except *:
19731973
19741974 return c_sorting_columns
19751975
cdef void _set_bloom_opts_for_column(
        WriterProperties.Builder* props,
        column,
        column_bloom_opts) except *:
    """
    Set Bloom filter options for a single column.

    Parameters
    ----------
    props : WriterProperties.Builder*
        Builder on which the Bloom filter is enabled or disabled.
    column : object
        Column path; passed through ``tobytes`` before reaching the builder.
    column_bloom_opts : bool or dict
        ``False`` disables the Bloom filter for the column. ``True`` enables
        it with a ``BloomFilterOptions`` whose fields are left untouched.
        A dict enables it and may carry ``"ndv"`` (int greater than zero)
        and/or ``"fpp"`` (float strictly between 0.0 and 1.0); a key that is
        absent leaves the corresponding field untouched.

    Raises
    ------
    TypeError
        If ``column_bloom_opts`` is neither a bool nor a dict, if ``"ndv"``
        is not an int (bools are rejected), or if ``"fpp"`` is not a float.
    ValueError
        If ``"ndv"`` is not greater than zero or ``"fpp"`` lies outside the
        open interval (0.0, 1.0).
    """
    cdef:
        BloomFilterOptions bloom_opts

    if isinstance(column_bloom_opts, bool):
        # True falls through to enable_bloom_filter with untouched options;
        # False switches the filter off and there is nothing left to do.
        if not column_bloom_opts:
            props.disable_bloom_filter(tobytes(column))
            return
    elif isinstance(column_bloom_opts, dict):
        if "ndv" in column_bloom_opts:
            ndv = column_bloom_opts["ndv"]
            # bool is a subclass of int, so without the explicit check
            # {"ndv": True} would silently be accepted as ndv == 1.
            if isinstance(ndv, bool) or not isinstance(ndv, int):
                raise TypeError(
                    f" 'bloom_filter_options:ndv' for column '{column}' must be an int")
            if ndv <= 0:
                raise ValueError(
                    f" 'bloom_filter_options:ndv' for column '{column}' must be greater than zero, got {ndv}")
            bloom_opts.ndv = ndv
        if "fpp" in column_bloom_opts:
            fpp = column_bloom_opts["fpp"]
            # Only floats are accepted: no integer lies inside (0.0, 1.0).
            if not isinstance(fpp, float):
                raise TypeError(
                    f" 'bloom_filter_options:fpp' for column '{column}' must be a float")
            if fpp <= 0.0 or fpp >= 1.0:
                raise ValueError(
                    f" 'bloom_filter_options:fpp' for column '{column}' must be in (0.0, 1.0), got {fpp}")
            bloom_opts.fpp = fpp
    else:
        raise TypeError(
            f" 'bloom_filter_options:{column}' must be a boolean or a dictionary")

    props.enable_bloom_filter(tobytes(column), bloom_opts)
2015+
2016+
cdef void _set_bloom_filter_opts(
        WriterProperties.Builder* props,
        bloom_filter_options) except *:
    """
    Apply Bloom filter settings for every listed column to ``props``.

    ``bloom_filter_options`` may be ``None``, in which case nothing is done.
    Otherwise it must be a dict keyed by column path; each value is handed,
    together with its key, to ``_set_bloom_opts_for_column``.

    Raises
    ------
    TypeError
        If ``bloom_filter_options`` is neither ``None`` nor a dict.
    """
    if bloom_filter_options is None:
        return

    if not isinstance(bloom_filter_options, dict):
        raise TypeError(" 'bloom_filter_options' must be a dictionary")

    # Entries have the shape {"path": {"ndv": ndv, "fpp": fpp}}; the
    # per-column helper turns each value into builder calls on props.
    for column_path, per_column_opts in bloom_filter_options.items():
        _set_bloom_opts_for_column(props, column_path, per_column_opts)
2029+
19762030
19772031cdef shared_ptr[WriterProperties] _create_writer_properties(
19782032 use_dictionary = None ,
@@ -1992,7 +2046,8 @@ cdef shared_ptr[WriterProperties] _create_writer_properties(
19922046 write_page_checksum = False ,
19932047 sorting_columns = None ,
19942048 store_decimal_as_integer = False ,
1995- use_content_defined_chunking = False ) except * :
2049+ use_content_defined_chunking = False ,
2050+ bloom_filter_options = None ) except * :
19962051
19972052 """ General writer properties"""
19982053 cdef:
@@ -2122,6 +2177,9 @@ cdef shared_ptr[WriterProperties] _create_writer_properties(
21222177 raise TypeError (
21232178 " 'column_encoding' should be a dictionary or a string" )
21242179
2180+ # bloom filters
2181+ _set_bloom_filter_opts(& props, bloom_filter_options)
2182+
21252183 # size limits
21262184 if data_page_size is not None :
21272185 props.data_pagesize(data_page_size)
@@ -2317,7 +2375,8 @@ cdef class ParquetWriter(_Weakrefable):
23172375 sorting_columns = None ,
23182376 store_decimal_as_integer = False ,
23192377 use_content_defined_chunking = False ,
2320- write_time_adjusted_to_utc = False ):
2378+ write_time_adjusted_to_utc = False ,
2379+ bloom_filter_options = None ):
23212380 cdef:
23222381 shared_ptr[WriterProperties] properties
23232382 shared_ptr[ArrowWriterProperties] arrow_properties
@@ -2353,7 +2412,8 @@ cdef class ParquetWriter(_Weakrefable):
23532412 write_page_checksum = write_page_checksum,
23542413 sorting_columns = sorting_columns,
23552414 store_decimal_as_integer = store_decimal_as_integer,
2356- use_content_defined_chunking = use_content_defined_chunking
2415+ use_content_defined_chunking = use_content_defined_chunking,
2416+ bloom_filter_options = bloom_filter_options
23572417 )
23582418 arrow_properties = _create_arrow_writer_properties(
23592419 use_deprecated_int96_timestamps = use_deprecated_int96_timestamps,
0 commit comments