Skip to content

Commit dc0bb44

Browse files
committed
Implement scan pandas object column
1 parent df0abf9 commit dc0bb44

File tree

11 files changed

+447
-13
lines changed

11 files changed

+447
-13
lines changed

CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,10 @@ pybind11_add_module(_kuzu
1414
src_cpp/py_prepared_statement.cpp
1515
src_cpp/py_query_result.cpp
1616
src_cpp/py_query_result_converter.cpp
17+
src_cpp/py_conversion.cpp
1718
src_cpp/pandas/pandas_bind.cpp
1819
src_cpp/pandas/pandas_scan.cpp
20+
src_cpp/pandas/pandas_analyzer.cpp
1921
src_cpp/numpy/numpy_type.cpp
2022
src_cpp/numpy/numpy_scan.cpp)
2123

src_cpp/include/numpy/numpy_scan.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ struct PandasColumnBindData;
1010
struct NumpyScan {
1111
static void scan(PandasColumnBindData* bindData, uint64_t count, uint64_t offset,
1212
common::ValueVector* outputVector);
13+
static void scanObjectColumn(
14+
PyObject** col, uint64_t count, uint64_t offset, common::ValueVector* outputVector);
1315
};
1416

1517
} // namespace kuzu
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#pragma once
2+
3+
#include "common/types/types.h"
4+
#include "pybind_include.h"
5+
6+
namespace kuzu {
7+
8+
struct PythonGILWrapper {
9+
py::gil_scoped_acquire acquire;
10+
};
11+
12+
class PandasAnalyzer {
13+
public:
14+
PandasAnalyzer() { analyzedType = common::LogicalType{common::LogicalTypeID::ANY}; }
15+
16+
public:
17+
common::LogicalType getListType(py::object& ele, bool& canConvert);
18+
common::LogicalType getItemType(py::object ele, bool& canConvert);
19+
bool analyze(py::object column);
20+
common::LogicalType getAnalyzedType() { return analyzedType; }
21+
22+
private:
23+
common::LogicalType innerAnalyze(py::object column, bool& canConvert);
24+
25+
private:
26+
PythonGILWrapper gil;
27+
common::LogicalType analyzedType;
28+
};
29+
30+
} // namespace kuzu

src_cpp/include/pandas/pandas_bind.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include "numpy/numpy_type.h"
44
#include "pandas_column.h"
55
#include "pybind_include.h"
6+
#include "py_object_container.h"
67

78
namespace kuzu {
89

@@ -19,6 +20,7 @@ struct PandasColumnBindData {
1920
NumpyType npType;
2021
std::unique_ptr<PandasColumn> pandasCol;
2122
std::unique_ptr<RegisteredArray> mask;
23+
PythonObjectContainer objectStrValContainer;
2224

2325
PandasColumnBindData() = default;
2426

src_cpp/include/py_conversion.h

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#pragma once
2+
3+
#include <cstdint>
4+
5+
#include "common/exception/conversion.h"
6+
#include "common/types/value/value.h"
7+
#include "common/vector/value_vector.h"
8+
#include "pybind_include.h"
9+
#include <datetime.h>
10+
11+
namespace kuzu {
12+
13+
enum class PythonObjectType : uint8_t {
14+
None,
15+
Integer,
16+
Float,
17+
Bool,
18+
Datetime,
19+
Date,
20+
String,
21+
List,
22+
};
23+
24+
PythonObjectType getPythonObjectType(py::handle& ele);
25+
26+
void tryTransformPythonNumeric(common::ValueVector* outputVector, uint64_t pos, py::handle ele);
27+
28+
void transformListValue(common::ValueVector* outputVector, uint64_t pos, py::handle ele);
29+
30+
void transformPythonValue(common::ValueVector* outputVector, uint64_t pos, py::handle ele);
31+
32+
} // namespace kuzu
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#pragma once
2+
3+
#include <vector>
4+
#include "common/assert.h"
5+
6+
#include "pybind_include.h"
7+
8+
namespace kuzu {
9+
10+
class PythonObjectContainer {
11+
public:
12+
PythonObjectContainer() {}
13+
14+
~PythonObjectContainer() {
15+
py::gil_scoped_acquire acquire;
16+
pyObjects.clear();
17+
}
18+
19+
void push(py::object&& obj) {
20+
py::gil_scoped_acquire gil;
21+
PushInternal(std::move(obj));
22+
}
23+
24+
const py::object& getLastAddedObject() {
25+
KU_ASSERT(!pyObjects.empty());
26+
return pyObjects.back();
27+
}
28+
29+
private:
30+
void PushInternal(py::object&& obj) { pyObjects.emplace_back(obj); }
31+
32+
private:
33+
std::vector<py::object> pyObjects;
34+
};
35+
36+
} // namespace kuzu

src_cpp/numpy/numpy_scan.cpp

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
#include "numpy/numpy_scan.h"
22

3+
#include "common/type_utils.h"
34
#include "common/types/timestamp_t.h"
45
#include "pandas/pandas_bind.h"
6+
#include "py_conversion.h"
57
#include "py_str_utils.h"
68
#include "utf8proc_wrapper.h"
79

@@ -142,15 +144,27 @@ void NumpyScan::scan(PandasColumnBindData* bindData, uint64_t count, uint64_t of
142144
case NumpyNullableType::OBJECT: {
143145
auto sourceData = (PyObject**)array.data();
144146
if (outputVector->dataType.getLogicalTypeID() != LogicalTypeID::STRING) {
145-
// LCOV_EXCL_START
146-
throw RuntimeException{"Scanning pandas generic object column is not supported."};
147-
// LCOV_EXCL_STOP
147+
scanObjectColumn(sourceData, count, offset, outputVector);
148+
return;
148149
}
149150
auto dstData = reinterpret_cast<ku_string_t*>(outputVector->getData());
150151
py::gil_scoped_acquire gil;
151152
for (auto i = 0u; i < count; i++) {
152153
auto pos = i + offset;
153154
PyObject* val = sourceData[pos];
155+
if (bindData->npType.type == NumpyNullableType::OBJECT &&
156+
!py::isinstance<py::str>(val)) {
157+
if (val == Py_None ||
158+
(py::isinstance<py::float_>(val) && std::isnan(PyFloat_AsDouble(val)))) {
159+
outputVector->setNull(pos, true /* isNull */);
160+
continue;
161+
}
162+
if (!py::isinstance<py::str>(val)) {
163+
bindData->objectStrValContainer.push(std::move(py::str(val)));
164+
val = reinterpret_cast<PyObject*>(
165+
bindData->objectStrValContainer.getLastAddedObject().ptr());
166+
}
167+
}
154168
py::handle strHandle(val);
155169
if (!py::isinstance<py::str>(strHandle)) {
156170
outputVector->setNull(i, true /* isNull */);
@@ -198,4 +212,22 @@ void NumpyScan::scan(PandasColumnBindData* bindData, uint64_t count, uint64_t of
198212
}
199213
}
200214

215+
void scanNumpyObject(PyObject* object, uint64_t offset, common::ValueVector* outputVector) {
216+
if (object == Py_None) {
217+
outputVector->setNull(offset, true /* isNull */);
218+
return;
219+
}
220+
outputVector->setNull(offset, false /* isNull */);
221+
transformPythonValue(outputVector, offset, object);
222+
}
223+
224+
void NumpyScan::scanObjectColumn(
225+
PyObject** col, uint64_t count, uint64_t offset, common::ValueVector* outputVector) {
226+
py::gil_scoped_acquire gil;
227+
auto srcPtr = col + offset;
228+
for (auto i = 0u; i < count; i++) {
229+
scanNumpyObject(srcPtr[i], i, outputVector);
230+
}
231+
}
232+
201233
} // namespace kuzu

src_cpp/pandas/pandas_analyzer.cpp

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
#include "pandas/pandas_analyzer.h"
2+
3+
#include "function/built_in_function.h"
4+
#include "py_conversion.h"
5+
6+
namespace kuzu {
7+
8+
static bool upgradeType(common::LogicalType& left, const common::LogicalType& right) {
9+
if (right.getLogicalTypeID() == common::LogicalTypeID::ANY) {
10+
return true;
11+
}
12+
if (left.getLogicalTypeID() == common::LogicalTypeID::ANY ||
13+
((left.getLogicalTypeID() == common::LogicalTypeID::VAR_LIST) &&
14+
(common::VarListType::getChildType(&left)->getLogicalTypeID() ==
15+
common::LogicalTypeID::ANY))) {
16+
left = right;
17+
return true;
18+
}
19+
if (((right.getLogicalTypeID() == common::LogicalTypeID::VAR_LIST) &&
20+
(common::VarListType::getChildType(&right)->getLogicalTypeID() ==
21+
common::LogicalTypeID::ANY))) {
22+
return true;
23+
}
24+
auto leftToRightCost =
25+
function::BuiltInFunctions::getCastCost(left.getLogicalTypeID(), right.getLogicalTypeID());
26+
if (leftToRightCost != common::UNDEFINED_CAST_COST) {
27+
left = right;
28+
} else {
29+
return false;
30+
}
31+
return true;
32+
}
33+
34+
common::LogicalType PandasAnalyzer::getListType(py::object& ele, bool& canConvert) {
35+
uint64_t i = 0;
36+
common::LogicalType listType;
37+
for (auto pyVal : ele) {
38+
auto object = py::reinterpret_borrow<py::object>(pyVal);
39+
auto itemType = getItemType(object, canConvert);
40+
if (i != 0) {
41+
listType = itemType;
42+
} else {
43+
if (!upgradeType(listType, itemType)) {
44+
canConvert = false;
45+
}
46+
}
47+
if (!canConvert) {
48+
break;
49+
}
50+
i++;
51+
}
52+
return listType;
53+
}
54+
55+
common::LogicalType PandasAnalyzer::getItemType(py::object ele, bool& canConvert) {
56+
auto objectType = getPythonObjectType(ele);
57+
switch (objectType) {
58+
case PythonObjectType::None:
59+
return *common::LogicalType::ANY();
60+
case PythonObjectType::Bool:
61+
return *common::LogicalType::BOOL();
62+
case PythonObjectType::Integer:
63+
return *common::LogicalType::INT64();
64+
case PythonObjectType::Float:
65+
return *common::LogicalType::DOUBLE();
66+
case PythonObjectType::Datetime:
67+
return *common::LogicalType::TIMESTAMP();
68+
case PythonObjectType::Date:
69+
return *common::LogicalType::DATE();
70+
case PythonObjectType::String:
71+
return *common::LogicalType::STRING();
72+
case PythonObjectType::List:
73+
return *common::LogicalType::VAR_LIST(getListType(ele, canConvert));
74+
default:
75+
KU_UNREACHABLE;
76+
}
77+
}
78+
79+
static py::object findFirstNonNull(const py::handle& row, uint64_t numRows) {
80+
for (auto i = 0u; i < numRows; i++) {
81+
auto obj = row(i);
82+
if (!obj.is_none()) {
83+
return obj;
84+
}
85+
}
86+
return py::none();
87+
}
88+
89+
common::LogicalType PandasAnalyzer::innerAnalyze(py::object column, bool& canConvert) {
90+
auto numRows = py::len(column);
91+
auto pandasModule = py::module::import("pandas");
92+
auto pandasSeries = pandasModule.attr("core").attr("series").attr("Series");
93+
94+
if (py::isinstance(column, pandasSeries)) {
95+
column = column.attr("__array__")();
96+
}
97+
auto row = column.attr("__getitem__");
98+
common::LogicalType itemType = getItemType(findFirstNonNull(row, numRows), canConvert);
99+
for (auto i = 1u; i < numRows; i += 1) {
100+
auto obj = row(i);
101+
auto curItemType = getItemType(obj, canConvert);
102+
if (!canConvert || !upgradeType(itemType, curItemType)) {
103+
canConvert = false;
104+
return curItemType;
105+
}
106+
}
107+
if (itemType.getPhysicalType() == common::PhysicalTypeID::ANY) {
108+
canConvert = false;
109+
}
110+
return itemType;
111+
}
112+
113+
bool PandasAnalyzer::analyze(py::object column) {
114+
bool canConvert = true;
115+
auto type = innerAnalyze(std::move(column), canConvert);
116+
if (canConvert) {
117+
analyzedType = type;
118+
}
119+
return canConvert;
120+
}
121+
122+
} // namespace kuzu

src_cpp/pandas/pandas_bind.cpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include "pandas/pandas_bind.h"
22

33
#include "common/exception/runtime.h"
4+
#include "pandas/pandas_analyzer.h"
45

56
namespace kuzu {
67

@@ -40,8 +41,8 @@ struct PandasDataFrameBind {
4041
py::object getter;
4142
};
4243

43-
static common::LogicalType bindColumn(PandasBindColumn& bindColumn,
44-
PandasColumnBindData* bindData) {
44+
static common::LogicalType bindColumn(
45+
PandasBindColumn& bindColumn, PandasColumnBindData* bindData) {
4546
common::LogicalType columnType;
4647
auto& column = bindColumn.handle;
4748

@@ -75,13 +76,18 @@ static common::LogicalType bindColumn(PandasBindColumn& bindColumn,
7576
}
7677
columnType = *NumpyTypeUtils::numpyToLogicalType(bindData->npType);
7778
}
79+
if (bindData->npType.type == NumpyNullableType::OBJECT) {
80+
PandasAnalyzer analyzer;
81+
if (analyzer.analyze(column)) {
82+
columnType = analyzer.getAnalyzedType();
83+
}
84+
}
7885
return columnType;
7986
}
8087

8188
void Pandas::bind(py::handle dfToBind,
8289
std::vector<std::unique_ptr<PandasColumnBindData>>& columnBindData,
83-
std::vector<common::LogicalType>& returnTypes,
84-
std::vector<std::string>& names) {
90+
std::vector<common::LogicalType>& returnTypes, std::vector<std::string>& names) {
8591

8692
PandasDataFrameBind df(dfToBind);
8793
auto numColumns = py::len(df.names);

0 commit comments

Comments
 (0)