-
Notifications
You must be signed in to change notification settings - Fork 4.1k
Expand file tree
/
Copy patharrow_to_pandas.h
More file actions
165 lines (134 loc) · 6.07 KB
/
arrow_to_pandas.h
File metadata and controls
165 lines (134 loc) · 6.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// Functions for converting between pandas's NumPy-based data representation
// and Arrow data structures
#pragma once
#include "arrow/python/platform.h"
#include <memory>
#include <string>
#include <unordered_set>
#include "arrow/memory_pool.h"
#include "arrow/python/visibility.h"
namespace arrow {
// Forward declarations of Arrow core types, so that this header can name them
// in declarations without requiring their full definitions here.
// NOTE(review): Column and DataType are not referenced in the visible part of
// this header -- confirm they are still needed.
class Array;
class ChunkedArray;
class Column;
class DataType;
class MemoryPool;
class Status;
class Table;
namespace py {
/// \brief Selects how Arrow Map arrays are turned into Python objects.
///
/// The numeric values are spelled out explicitly; they are the same values the
/// enumerators receive implicitly from their declaration order.
enum class MapConversionType {
  /// Convert Arrow maps to association lists (lists of key-value tuples) in
  /// pandas.
  DEFAULT = 0,
  /// Report warnings when lossiness is encountered due to duplicate keys.
  LOSSY = 1,
  /// Raise a Python exception when lossiness is encountered due to duplicate
  /// keys.
  STRICT_ = 2,
};
/// \brief Bundle of settings that steer the Arrow -> pandas/NumPy conversion
/// functions declared in this header.
///
/// Every field has a default, so a default-constructed PandasOptions is usable.
struct PandasOptions {
  /// \brief True when a non-empty set of categorical column names is present.
  bool HasCategoricalColumns() const {
    if (categorical_columns == nullptr) {
      return false;
    }
    return !categorical_columns->empty();
  }

  /// \brief True when `name` is a member of `categorical_columns`.
  bool IsCategoricalColumn(const std::string& name) const {
    if (categorical_columns == nullptr) {
      return false;
    }
    return categorical_columns->find(name) != categorical_columns->end();
  }

  /// \brief True when a non-empty set of extension column names is present.
  bool HasExtensionColumns() const {
    if (extension_columns == nullptr) {
      return false;
    }
    return !extension_columns->empty();
  }

  /// \brief True when `name` is a member of `extension_columns`.
  bool IsExtensionColumn(const std::string& name) const {
    if (extension_columns == nullptr) {
      return false;
    }
    return extension_columns->find(name) != extension_columns->end();
  }

  /// The arrow::MemoryPool from which memory is allocated.
  MemoryPool* pool = default_memory_pool();

  /// When true, every string column is converted to a categorical.
  bool strings_to_categorical = false;

  // NOTE(review): the next five flags carry no documentation in this header;
  // confirm their exact meaning against the implementation before relying on
  // the names alone.
  bool zero_copy_only = false;
  bool integer_object_nulls = false;
  bool date_as_object = false;
  bool timestamp_as_object = false;
  bool use_threads = false;

  /// When true, all date and timestamp values are coerced to datetime64[ns].
  bool coerce_temporal_nanoseconds = false;

  /// Kept for backwards compatibility with earlier timezone bugs (see
  /// ARROW-9528). Slated for removal after the Arrow 2.0 release.
  bool ignore_timezone = false;

  /// \brief When true, equal objects are not materialized as duplicate
  /// PyObjects. Applies only to immutable objects such as strings or datetime
  /// objects.
  bool deduplicate_objects = false;

  /// \brief Some data types must be cast before they can be stored in a pandas
  /// DataFrame or Series (e.g. pandas always stores timestamps as
  /// nanoseconds). This flag selects whether that cast is a safe cast.
  bool safe_cast = true;

  /// \brief When true, produce one block per column instead of consolidated
  /// blocks (one per data type), wrapping without a copy when there are no
  /// nulls. pandas currently consolidates the blocks on its own, which raises
  /// memory use -- bear that in mind in memory-constrained situations.
  bool split_blocks = false;

  /// \brief When true, non-writable zero-copy views may be created for
  /// single-column blocks. The same option is used to provide zero copy for
  /// Series data.
  bool allow_zero_copy_blocks = false;

  /// \brief When true, try to deallocate the buffers of the passed Arrow
  /// object if it is the only remaining shared_ptr copy of it. See ARROW-3789
  /// for the original context. Currently implemented only for Table
  /// conversions.
  bool self_destruct = false;

  /// \brief Controls how Arrow Map arrays are converted.
  ///
  /// With DEFAULT, a map becomes a Python association list (list of tuples)
  /// in the same order as the Arrow Map: [(key1, value1), (key2, value2), ...].
  /// With LOSSY or STRICT, a map becomes a native Python dict. That can change
  /// the ordering of (key, value) pairs and deduplicates repeated keys, so
  /// data may be lost. Under 'lossy' a detected deduplication prints a
  /// warning; under 'strict' it raises an exception instead.
  MapConversionType maps_as_pydicts = MapConversionType::DEFAULT;

  // Internal use: nested arrays.
  bool decode_dictionaries = false;

  // Names of the columns to cast to categorical.
  //
  // Held through a shared_ptr because this struct is copied internally for
  // each column or nested field (see GH-47861).
  std::shared_ptr<const std::unordered_set<std::string>> categorical_columns;

  // Names of the columns to pass through for conversion to
  // ExtensionArray/Block.
  std::shared_ptr<const std::unordered_set<std::string>> extension_columns;

  // Internal use: tells to_numpy() apart from to_pandas() where the expected
  // output differs.
  bool to_numpy = false;
};
// Convert a single Arrow Array to its pandas representation, as configured by
// `options`.
//
// Failure is reported through the returned Status.
// NOTE(review): the resulting Python object is presumably stored in `*out`,
// and `py_ref` is presumably a Python object tied to the lifetime of the
// source data -- confirm both, and the reference ownership of `*out`, against
// the implementation.
ARROW_PYTHON_EXPORT
Status ConvertArrayToPandas(const PandasOptions& options, std::shared_ptr<Array> arr,
PyObject* py_ref, PyObject** out);
// Convert an Arrow ChunkedArray to its pandas representation, as configured by
// `options`.
//
// Failure is reported through the returned Status.
// NOTE(review): the resulting Python object is presumably stored in `*out`,
// and `py_ref` is presumably a Python object tied to the lifetime of the
// source data -- confirm both, and the reference ownership of `*out`, against
// the implementation.
ARROW_PYTHON_EXPORT
Status ConvertChunkedArrayToPandas(const PandasOptions& options,
std::shared_ptr<ChunkedArray> col, PyObject* py_ref,
PyObject** out);
// Convert a whole table as efficiently as possible to a pandas.DataFrame.
//
// The returned Python object is a list of tuples consisting of the exact 2D
// BlockManager structure of the pandas.DataFrame used as of pandas 0.19.x.
//
// tuple item: (indices: ndarray[int32], block: ndarray[TYPE, ndim=2])
//
// Conversion is configured by `options`; per the PandasOptions comments,
// `self_destruct` is currently implemented only for Table conversions.
// Failure is reported through the returned Status.
// NOTE(review): the Python object is presumably stored in `*out` -- confirm
// its reference ownership against the implementation.
ARROW_PYTHON_EXPORT
Status ConvertTableToPandas(const PandasOptions& options, std::shared_ptr<Table> table,
PyObject** out);
} // namespace py
} // namespace arrow