Skip to content
Prev Previous commit
Next Next commit
fix file upload
  • Loading branch information
mfeurer committed Nov 2, 2020
commit 0babe6ab48c537e0487e248388cc046e958cc49c
17 changes: 9 additions & 8 deletions openml/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,8 @@ def find_invalid_characters(string, pattern):
)

self.cache_format = cache_format
self.data_format = data_format
# The attribute has to be called `format` (not `data_format`), otherwise there will be an XML upload error
self.format = data_format
self.creator = creator
self.contributor = contributor
self.collection_date = collection_date
Expand Down Expand Up @@ -244,7 +245,7 @@ def _get_repr_body_fields(self) -> List[Tuple[str, Union[str, int, List[str]]]]:
fields = {
"Name": self.name,
"Version": self.version,
"Format": self.data_format,
"Format": self.format,
"Licence": self.licence,
"Download URL": self.url,
"Data file": self.data_file,
Expand Down Expand Up @@ -377,7 +378,7 @@ def _parse_data_from_arff(
List[str]: List of column names.
"""
try:
data = self._get_arff(self.data_format)
data = self._get_arff(self.format)
except OSError as e:
logger.critical(
"Please check that the data file {} is "
Expand All @@ -398,7 +399,7 @@ def _parse_data_from_arff(
for i, (name, type_) in enumerate(data["attributes"]):
# if the feature is nominal and a sparse matrix is
# requested, the categories need to be numeric
if isinstance(type_, list) and self.data_format.lower() == "sparse_arff":
if isinstance(type_, list) and self.format.lower() == "sparse_arff":
try:
# checks if the strings which should be the class labels
# can be encoded into integers
Expand All @@ -408,7 +409,7 @@ def _parse_data_from_arff(
"Categorical data needs to be numeric when " "using sparse ARFF."
)
# strings can only be supported with a pandas DataFrame
elif type_ == "STRING" and self.data_format.lower() == "sparse_arff":
elif type_ == "STRING" and self.format.lower() == "sparse_arff":
raise ValueError("Dataset containing strings is not supported " "with sparse ARFF.")

# infer the dtype from the ARFF header
Expand All @@ -431,12 +432,12 @@ def _parse_data_from_arff(
attribute_dtype[name] = ARFF_DTYPES_TO_PD_DTYPE[type_]
attribute_names.append(name)

if self.data_format.lower() == "sparse_arff":
if self.format.lower() == "sparse_arff":
X = data["data"]
X_shape = (max(X[1]) + 1, max(X[2]) + 1)
X = scipy.sparse.coo_matrix((X[0], (X[1], X[2])), shape=X_shape, dtype=np.float32)
X = X.tocsr()
elif self.data_format.lower() == "arff":
elif self.format.lower() == "arff":
X = pd.DataFrame(data["data"], columns=attribute_names)

col = []
Expand All @@ -460,7 +461,7 @@ def _parse_data_from_arff(
col.append(X[column_name])
X = pd.concat(col, axis=1)
else:
raise ValueError("Dataset format '{}' is not a valid format.".format(self.data_format))
raise ValueError("Dataset format '{}' is not a valid format.".format(self.format))

return X, categorical, attribute_names

Expand Down
4 changes: 2 additions & 2 deletions tests/test_datasets/test_dataset_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -860,7 +860,7 @@ def test_get_online_dataset_arff(self):
decoder = arff.ArffDecoder()
# check if the arff from the dataset is
# the same as the arff from _get_arff function
d_format = (dataset.data_format).lower()
d_format = (dataset.format).lower()

self.assertEqual(
dataset._get_arff(d_format),
Expand All @@ -879,7 +879,7 @@ def test_get_online_dataset_format(self):
dataset = openml.datasets.get_dataset(dataset_id, download_data=False)

self.assertEqual(
(dataset.data_format).lower(),
(dataset.format).lower(),
_get_online_dataset_format(dataset_id),
"The format of the ARFF files is different",
)
Expand Down